libstdc++
regex_compiler.h
Go to the documentation of this file.
00001 // class template regex -*- C++ -*-
00002 
00003 // Copyright (C) 2010-2019 Free Software Foundation, Inc.
00004 //
00005 // This file is part of the GNU ISO C++ Library.  This library is free
00006 // software; you can redistribute it and/or modify it under the
00007 // terms of the GNU General Public License as published by the
00008 // Free Software Foundation; either version 3, or (at your option)
00009 // any later version.
00010 
00011 // This library is distributed in the hope that it will be useful,
00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00014 // GNU General Public License for more details.
00015 
00016 // Under Section 7 of GPL version 3, you are granted additional
00017 // permissions described in the GCC Runtime Library Exception, version
00018 // 3.1, as published by the Free Software Foundation.
00019 
00020 // You should have received a copy of the GNU General Public License and
00021 // a copy of the GCC Runtime Library Exception along with this program;
00022 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
00023 // <http://www.gnu.org/licenses/>.
00024 
00025 /**
00026  *  @file bits/regex_compiler.h
00027  *  This is an internal header file, included by other library headers.
00028  *  Do not attempt to use it directly. @headername{regex}
00029  */
00030 
00031 namespace std _GLIBCXX_VISIBILITY(default)
00032 {
00033 _GLIBCXX_BEGIN_NAMESPACE_VERSION
00034 _GLIBCXX_BEGIN_NAMESPACE_CXX11
00035 
00036   template<typename>
00037     class regex_traits;
00038 
00039 _GLIBCXX_END_NAMESPACE_CXX11
00040 
00041 namespace __detail
00042 {
00043   /**
00044    * @addtogroup regex-detail
00045    * @{
00046    */
00047 
00048   template<typename, bool, bool>
00049     struct _BracketMatcher;
00050 
00051   /**
00052    * @brief Builds an NFA from an input iterator range.
00053    *
00054    * The %_TraitsT type should fulfill requirements [28.3].
00055    */
00056   template<typename _TraitsT>
00057     class _Compiler
00058     {
00059     public:
00060       typedef typename _TraitsT::char_type        _CharT;
00061       typedef const _CharT*                       _IterT;
00062       typedef _NFA<_TraitsT>                      _RegexT;
00063       typedef regex_constants::syntax_option_type _FlagT;
00064 
00065       _Compiler(_IterT __b, _IterT __e,
00066                 const typename _TraitsT::locale_type& __traits, _FlagT __flags);
00067 
00068       shared_ptr<const _RegexT>
00069       _M_get_nfa()
00070       { return std::move(_M_nfa); }
00071 
00072     private:
00073       typedef _Scanner<_CharT>               _ScannerT;
00074       typedef typename _TraitsT::string_type _StringT;
00075       typedef typename _ScannerT::_TokenT    _TokenT;
00076       typedef _StateSeq<_TraitsT>            _StateSeqT;
00077       typedef std::stack<_StateSeqT>         _StackT;
00078       typedef std::ctype<_CharT>             _CtypeT;
00079 
00080       // accepts a specific token or returns false.
00081       bool
00082       _M_match_token(_TokenT __token);
00083 
00084       void
00085       _M_disjunction();
00086 
00087       void
00088       _M_alternative();
00089 
00090       bool
00091       _M_term();
00092 
00093       bool
00094       _M_assertion();
00095 
00096       bool
00097       _M_quantifier();
00098 
00099       bool
00100       _M_atom();
00101 
00102       bool
00103       _M_bracket_expression();
00104 
00105       template<bool __icase, bool __collate>
00106         void
00107         _M_insert_any_matcher_ecma();
00108 
00109       template<bool __icase, bool __collate>
00110         void
00111         _M_insert_any_matcher_posix();
00112 
00113       template<bool __icase, bool __collate>
00114         void
00115         _M_insert_char_matcher();
00116 
00117       template<bool __icase, bool __collate>
00118         void
00119         _M_insert_character_class_matcher();
00120 
00121       template<bool __icase, bool __collate>
00122         void
00123         _M_insert_bracket_matcher(bool __neg);
00124 
00125       // Returns true if successfully matched one term and should continue.
00126       // Returns false if the compiler should move on.
00127       template<bool __icase, bool __collate>
00128         bool
00129         _M_expression_term(pair<bool, _CharT>& __last_char,
00130                            _BracketMatcher<_TraitsT, __icase, __collate>&
00131                            __matcher);
00132 
00133       int
00134       _M_cur_int_value(int __radix);
00135 
00136       bool
00137       _M_try_char();
00138 
00139       _StateSeqT
00140       _M_pop()
00141       {
00142         auto ret = _M_stack.top();
00143         _M_stack.pop();
00144         return ret;
00145       }
00146 
00147       _FlagT              _M_flags;
00148       _ScannerT           _M_scanner;
00149       shared_ptr<_RegexT> _M_nfa;
00150       _StringT            _M_value;
00151       _StackT             _M_stack;
00152       const _TraitsT&     _M_traits;
00153       const _CtypeT&      _M_ctype;
00154     };
00155 
00156   template<typename _Tp>
00157     struct __is_contiguous_iter : is_pointer<_Tp>::type { };
00158 
00159   template<typename _Tp, typename _Cont>
00160     struct
00161     __is_contiguous_iter<__gnu_cxx::__normal_iterator<_Tp*, _Cont>>
00162     : true_type { };
00163 
00164   template<typename _Iter, typename _TraitsT>
00165     using __enable_if_contiguous_iter
00166       = __enable_if_t< __is_contiguous_iter<_Iter>::value,
00167                        std::shared_ptr<const _NFA<_TraitsT>> >;
00168 
00169   template<typename _Iter, typename _TraitsT>
00170     using __disable_if_contiguous_iter
00171       = __enable_if_t< !__is_contiguous_iter<_Iter>::value,
00172                        std::shared_ptr<const _NFA<_TraitsT>> >;
00173 
00174   template<typename _TraitsT, typename _FwdIter>
00175     inline __enable_if_contiguous_iter<_FwdIter, _TraitsT>
00176     __compile_nfa(_FwdIter __first, _FwdIter __last,
00177                   const typename _TraitsT::locale_type& __loc,
00178                   regex_constants::syntax_option_type __flags)
00179     {
00180       size_t __len = __last - __first;
00181       const auto* __cfirst = __len ? std::__addressof(*__first) : nullptr;
00182       using _Cmplr = _Compiler<_TraitsT>;
00183       return _Cmplr(__cfirst, __cfirst + __len, __loc, __flags)._M_get_nfa();
00184     }
00185 
00186   template<typename _TraitsT, typename _FwdIter>
00187     inline __disable_if_contiguous_iter<_FwdIter, _TraitsT>
00188     __compile_nfa(_FwdIter __first, _FwdIter __last,
00189                   const typename _TraitsT::locale_type& __loc,
00190                   regex_constants::syntax_option_type __flags)
00191     {
00192       const basic_string<typename _TraitsT::char_type> __str(__first, __last);
00193       return __compile_nfa<_TraitsT>(__str.data(), __str.data() + __str.size(),
00194                                      __loc, __flags);
00195     }
00196 
00197   // [28.13.14]
00198   template<typename _TraitsT, bool __icase, bool __collate>
00199     class _RegexTranslatorBase
00200     {
00201     public:
00202       typedef typename _TraitsT::char_type            _CharT;
00203       typedef typename _TraitsT::string_type          _StringT;
00204       typedef _StringT _StrTransT;
00205 
00206       explicit
00207       _RegexTranslatorBase(const _TraitsT& __traits)
00208       : _M_traits(__traits)
00209       { }
00210 
00211       _CharT
00212       _M_translate(_CharT __ch) const
00213       {
00214         if (__icase)
00215           return _M_traits.translate_nocase(__ch);
00216         else if (__collate)
00217           return _M_traits.translate(__ch);
00218         else
00219           return __ch;
00220       }
00221 
00222       _StrTransT
00223       _M_transform(_CharT __ch) const
00224       {
00225         _StrTransT __str(1, __ch);
00226         return _M_traits.transform(__str.begin(), __str.end());
00227       }
00228 
00229       // See LWG 523. It's not efficiently implementable when _TraitsT is not
00230       // std::regex_traits<>, and __collate is true. See specializations for
00231       // implementations of other cases.
00232       bool
00233       _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
00234                      const _StrTransT& __s) const
00235       { return __first <= __s && __s <= __last; }
00236 
00237     protected:
00238       bool _M_in_range_icase(_CharT __first, _CharT __last, _CharT __ch) const
00239       {
00240         typedef std::ctype<_CharT> __ctype_type;
00241         const auto& __fctyp = use_facet<__ctype_type>(this->_M_traits.getloc());
00242         auto __lower = __fctyp.tolower(__ch);
00243         auto __upper = __fctyp.toupper(__ch);
00244         return (__first <= __lower && __lower <= __last)
00245           || (__first <= __upper && __upper <= __last);
00246       }
00247 
00248       const _TraitsT& _M_traits;
00249     };
00250 
00251   template<typename _TraitsT, bool __icase, bool __collate>
00252     class _RegexTranslator
00253     : public _RegexTranslatorBase<_TraitsT, __icase, __collate>
00254     {
00255     public:
00256       typedef _RegexTranslatorBase<_TraitsT, __icase, __collate> _Base;
00257       using _Base::_Base;
00258     };
00259 
00260   template<typename _TraitsT, bool __icase>
00261     class _RegexTranslator<_TraitsT, __icase, false>
00262     : public _RegexTranslatorBase<_TraitsT, __icase, false>
00263     {
00264     public:
00265       typedef _RegexTranslatorBase<_TraitsT, __icase, false> _Base;
00266       typedef typename _Base::_CharT _CharT;
00267       typedef _CharT _StrTransT;
00268 
00269       using _Base::_Base;
00270 
00271       _StrTransT
00272       _M_transform(_CharT __ch) const
00273       { return __ch; }
00274 
00275       bool
00276       _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
00277       {
00278         if (!__icase)
00279           return __first <= __ch && __ch <= __last;
00280         return this->_M_in_range_icase(__first, __last, __ch);
00281       }
00282     };
00283 
00284   template<typename _CharType>
00285     class _RegexTranslator<std::regex_traits<_CharType>, true, true>
00286     : public _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
00287     {
00288     public:
00289       typedef _RegexTranslatorBase<std::regex_traits<_CharType>, true, true>
00290         _Base;
00291       typedef typename _Base::_CharT _CharT;
00292       typedef typename _Base::_StrTransT _StrTransT;
00293 
00294       using _Base::_Base;
00295 
00296       bool
00297       _M_match_range(const _StrTransT& __first, const _StrTransT& __last,
00298                      const _StrTransT& __str) const
00299       {
00300         __glibcxx_assert(__first.size() == 1);
00301         __glibcxx_assert(__last.size() == 1);
00302         __glibcxx_assert(__str.size() == 1);
00303         return this->_M_in_range_icase(__first[0], __last[0], __str[0]);
00304       }
00305     };
00306 
00307   template<typename _TraitsT>
00308     class _RegexTranslator<_TraitsT, false, false>
00309     {
00310     public:
00311       typedef typename _TraitsT::char_type _CharT;
00312       typedef _CharT                       _StrTransT;
00313 
00314       explicit
00315       _RegexTranslator(const _TraitsT&)
00316       { }
00317 
00318       _CharT
00319       _M_translate(_CharT __ch) const
00320       { return __ch; }
00321 
00322       _StrTransT
00323       _M_transform(_CharT __ch) const
00324       { return __ch; }
00325 
00326       bool
00327       _M_match_range(_CharT __first, _CharT __last, _CharT __ch) const
00328       { return __first <= __ch && __ch <= __last; }
00329     };
00330 
00331   template<typename _TraitsT, bool __is_ecma, bool __icase, bool __collate>
00332     struct _AnyMatcher;
00333 
00334   template<typename _TraitsT, bool __icase, bool __collate>
00335     struct _AnyMatcher<_TraitsT, false, __icase, __collate>
00336     {
00337       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00338       typedef typename _TransT::_CharT                       _CharT;
00339 
00340       explicit
00341       _AnyMatcher(const _TraitsT& __traits)
00342       : _M_translator(__traits)
00343       { }
00344 
00345       bool
00346       operator()(_CharT __ch) const
00347       {
00348         static auto __nul = _M_translator._M_translate('\0');
00349         return _M_translator._M_translate(__ch) != __nul;
00350       }
00351 
00352       _TransT _M_translator;
00353     };
00354 
00355   template<typename _TraitsT, bool __icase, bool __collate>
00356     struct _AnyMatcher<_TraitsT, true, __icase, __collate>
00357     {
00358       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00359       typedef typename _TransT::_CharT                       _CharT;
00360 
00361       explicit
00362       _AnyMatcher(const _TraitsT& __traits)
00363       : _M_translator(__traits)
00364       { }
00365 
00366       bool
00367       operator()(_CharT __ch) const
00368       { return _M_apply(__ch, typename is_same<_CharT, char>::type()); }
00369 
00370       bool
00371       _M_apply(_CharT __ch, true_type) const
00372       {
00373         auto __c = _M_translator._M_translate(__ch);
00374         auto __n = _M_translator._M_translate('\n');
00375         auto __r = _M_translator._M_translate('\r');
00376         return __c != __n && __c != __r;
00377       }
00378 
00379       bool
00380       _M_apply(_CharT __ch, false_type) const
00381       {
00382         auto __c = _M_translator._M_translate(__ch);
00383         auto __n = _M_translator._M_translate('\n');
00384         auto __r = _M_translator._M_translate('\r');
00385         auto __u2028 = _M_translator._M_translate(u'\u2028');
00386         auto __u2029 = _M_translator._M_translate(u'\u2029');
00387         return __c != __n && __c != __r && __c != __u2028 && __c != __u2029;
00388       }
00389 
00390       _TransT _M_translator;
00391     };
00392 
00393   template<typename _TraitsT, bool __icase, bool __collate>
00394     struct _CharMatcher
00395     {
00396       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00397       typedef typename _TransT::_CharT                       _CharT;
00398 
00399       _CharMatcher(_CharT __ch, const _TraitsT& __traits)
00400       : _M_translator(__traits), _M_ch(_M_translator._M_translate(__ch))
00401       { }
00402 
00403       bool
00404       operator()(_CharT __ch) const
00405       { return _M_ch == _M_translator._M_translate(__ch); }
00406 
00407       _TransT _M_translator;
00408       _CharT  _M_ch;
00409     };
00410 
00411   /// Matches a character range (bracket expression)
00412   template<typename _TraitsT, bool __icase, bool __collate>
00413     struct _BracketMatcher
00414     {
00415     public:
00416       typedef _RegexTranslator<_TraitsT, __icase, __collate> _TransT;
00417       typedef typename _TransT::_CharT                       _CharT;
00418       typedef typename _TransT::_StrTransT                   _StrTransT;
00419       typedef typename _TraitsT::string_type                 _StringT;
00420       typedef typename _TraitsT::char_class_type             _CharClassT;
00421 
00422     public:
00423       _BracketMatcher(bool __is_non_matching,
00424                       const _TraitsT& __traits)
00425       : _M_class_set(0), _M_translator(__traits), _M_traits(__traits),
00426       _M_is_non_matching(__is_non_matching)
00427       { }
00428 
00429       bool
00430       operator()(_CharT __ch) const
00431       {
00432         _GLIBCXX_DEBUG_ASSERT(_M_is_ready);
00433         return _M_apply(__ch, _UseCache());
00434       }
00435 
00436       void
00437       _M_add_char(_CharT __c)
00438       {
00439         _M_char_set.push_back(_M_translator._M_translate(__c));
00440         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00441       }
00442 
00443       _StringT
00444       _M_add_collate_element(const _StringT& __s)
00445       {
00446         auto __st = _M_traits.lookup_collatename(__s.data(),
00447                                                  __s.data() + __s.size());
00448         if (__st.empty())
00449           __throw_regex_error(regex_constants::error_collate,
00450                               "Invalid collate element.");
00451         _M_char_set.push_back(_M_translator._M_translate(__st[0]));
00452         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00453         return __st;
00454       }
00455 
00456       void
00457       _M_add_equivalence_class(const _StringT& __s)
00458       {
00459         auto __st = _M_traits.lookup_collatename(__s.data(),
00460                                                  __s.data() + __s.size());
00461         if (__st.empty())
00462           __throw_regex_error(regex_constants::error_collate,
00463                               "Invalid equivalence class.");
00464         __st = _M_traits.transform_primary(__st.data(),
00465                                            __st.data() + __st.size());
00466         _M_equiv_set.push_back(__st);
00467         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00468       }
00469 
00470       // __neg should be true for \D, \S and \W only.
00471       void
00472       _M_add_character_class(const _StringT& __s, bool __neg)
00473       {
00474         auto __mask = _M_traits.lookup_classname(__s.data(),
00475                                                  __s.data() + __s.size(),
00476                                                  __icase);
00477         if (__mask == 0)
00478           __throw_regex_error(regex_constants::error_collate,
00479                               "Invalid character class.");
00480         if (!__neg)
00481           _M_class_set |= __mask;
00482         else
00483           _M_neg_class_set.push_back(__mask);
00484         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00485       }
00486 
00487       void
00488       _M_make_range(_CharT __l, _CharT __r)
00489       {
00490         if (__l > __r)
00491           __throw_regex_error(regex_constants::error_range,
00492                               "Invalid range in bracket expression.");
00493         _M_range_set.push_back(make_pair(_M_translator._M_transform(__l),
00494                                          _M_translator._M_transform(__r)));
00495         _GLIBCXX_DEBUG_ONLY(_M_is_ready = false);
00496       }
00497 
00498       void
00499       _M_ready()
00500       {
00501         std::sort(_M_char_set.begin(), _M_char_set.end());
00502         auto __end = std::unique(_M_char_set.begin(), _M_char_set.end());
00503         _M_char_set.erase(__end, _M_char_set.end());
00504         _M_make_cache(_UseCache());
00505         _GLIBCXX_DEBUG_ONLY(_M_is_ready = true);
00506       }
00507 
00508     private:
00509       // Currently we only use the cache for char
00510       typedef typename std::is_same<_CharT, char>::type _UseCache;
00511 
00512       static constexpr size_t
00513       _S_cache_size =
00514         1ul << (sizeof(_CharT) * __CHAR_BIT__ * int(_UseCache::value));
00515 
00516       struct _Dummy { };
00517       typedef typename std::conditional<_UseCache::value,
00518                                         std::bitset<_S_cache_size>,
00519                                         _Dummy>::type _CacheT;
00520       typedef typename std::make_unsigned<_CharT>::type _UnsignedCharT;
00521 
00522       bool
00523       _M_apply(_CharT __ch, false_type) const;
00524 
00525       bool
00526       _M_apply(_CharT __ch, true_type) const
00527       { return _M_cache[static_cast<_UnsignedCharT>(__ch)]; }
00528 
00529       void
00530       _M_make_cache(true_type)
00531       {
00532         for (unsigned __i = 0; __i < _M_cache.size(); __i++)
00533           _M_cache[__i] = _M_apply(static_cast<_CharT>(__i), false_type());
00534       }
00535 
00536       void
00537       _M_make_cache(false_type)
00538       { }
00539 
00540     private:
00541       std::vector<_CharT>                       _M_char_set;
00542       std::vector<_StringT>                     _M_equiv_set;
00543       std::vector<pair<_StrTransT, _StrTransT>> _M_range_set;
00544       std::vector<_CharClassT>                  _M_neg_class_set;
00545       _CharClassT                               _M_class_set;
00546       _TransT                                   _M_translator;
00547       const _TraitsT&                           _M_traits;
00548       bool                                      _M_is_non_matching;
00549       _CacheT                                   _M_cache;
00550 #ifdef _GLIBCXX_DEBUG
00551       bool                                      _M_is_ready = false;
00552 #endif
00553     };
00554 
00555  //@} regex-detail
00556 } // namespace __detail
00557 _GLIBCXX_END_NAMESPACE_VERSION
00558 } // namespace std
00559 
00560 #include <bits/regex_compiler.tcc>