// Source code: octave3.1
//
// regexp.cc

/*

Copyright (C) 2005, 2006, 2007 David Bateman
Copyright (C) 2002, 2003, 2004, 2005 Paul Kienzle

This file is part of Octave.

Octave is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

Octave is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with Octave; see the file COPYING.  If not, see
<http://www.gnu.org/licenses/>.

*/

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <algorithm>
#include <sstream>

#include "defun-dld.h"
#include "error.h"
#include "gripes.h"
#include "oct-obj.h"
#include "utils.h"

#include "Cell.h"
#include "oct-map.h"
#include "str-vec.h"
#include "quit.h"
#include "parse.h"
#include "oct-locbuf.h"

#if defined (HAVE_PCRE)
#include <pcre.h>
#elif defined (HAVE_REGEX)
#if defined (__MINGW32__)
#define __restrict
#endif
#if defined (HAVE_SYS_TYPES_H)
#include <sys/types.h>
#endif
#include <regex.h>
#endif

// Maximum number of times the PCRE match limit is raised (by a factor
// of 10 each time) and the match retried after pcre_exec has reported
// PCRE_ERROR_MATCHLIMIT for a pattern that possibly results in an
// infinite recursion.
#define PCRE_MATCHLIMIT_MAX 10

// The regexp is constructed as a linked list to avoid resizing the
// return values in arrays at each new match.

// FIXME don't bother collecting and composing return values the user
// doesn't want.

// One match of the pattern against the subject string, together with
// everything the caller may ask for about that match.
class regexp_elem
{
public:

  // Build an element from its six components.
  regexp_elem (const string_vector& _named_token, const Cell& _t,
               const std::string& _m, const Matrix& _te, double _s,
               double _e)
    : named_token (_named_token),
      t (_t),
      m (_m),
      te (_te),
      s (_s),
      e (_e)
  { }

  // Member-wise copy.
  regexp_elem (const regexp_elem &a)
    : named_token (a.named_token),
      t (a.t),
      m (a.m),
      te (a.te),
      s (a.s),
      e (a.e)
  { }

  // Text of each named token, indexed like the list of token names.
  string_vector named_token;

  // Text of each token (parenthesised sub-expression) in this match.
  Cell t;

  // Text of the whole match.
  std::string m;

  // One row per token: 1-based start and end index in the subject string.
  Matrix te;

  // 1-based index of the first character of the match.
  double s;

  // 1-based index of the last character of the match.
  double e;
};

// Read-only iterator over the list of matches built by octregexp_list.
typedef std::list<regexp_elem>::const_iterator const_iterator;

// Upper bound on the repetition count used when a lookbehind containing
// "*" or "+" is expanded into a set of fixed-length alternatives.
#define MAXLOOKBEHIND 10
// Set to true once the lookbehind-length warning has been issued, so that
// the warning is printed at most once.
static bool lookbehind_warned = false;

// Find the matches of the pattern args(1) in the string args(0) and
// append one regexp_elem per non-empty match to LST.
//
// ARGS             - argument list of the calling builtin: args(0) is the
//                    subject string, args(1) the pattern and any further
//                    arguments are option strings.
// NM               - name of the calling function, used as the prefix of
//                    error and warning messages.
// CASE_INSENSITIVE - initial case sensitivity; the "matchcase" and
//                    "ignorecase" options override it.
// LST              - receives the matches in the order they are found.
// NAMED            - receives the distinct names of the named tokens
//                    "(?<name>...)" found in the pattern (PCRE only).
// NOPTS            - set to the number of optional arguments that are not
//                    behaviour switches ("once", "matchcase", ...).
// ONCE             - set to true if the "once" option was given.
//
// Returns the number of elements appended to LST.  On failure an Octave
// error is raised with error () and 0 is returned.
static int
octregexp_list (const octave_value_list &args, const std::string &nm, 
                bool case_insensitive, std::list<regexp_elem> &lst, 
                string_vector &named, int &nopts, bool &once)
{
  int sz = 0;
#if defined (HAVE_REGEX) || defined (HAVE_PCRE) 
  int nargin = args.length();
  bool lineanchors = false;
  bool dotexceptnewline = false;
  bool freespacing = false;

  nopts = nargin - 2;
  once = false;

  std::string buffer = args(0).string_value ();
  if (error_state)
    {
      gripe_wrong_type_arg (nm.c_str(), args(0));
      return 0;
    }

  std::string pattern = args(1).string_value ();
  if (error_state)
    {
      gripe_wrong_type_arg (nm.c_str(), args(1));
      return 0;
    }

  // Parse the option strings.  Options are compared case-insensitively
  // and every behaviour switch that is recognised is removed from the
  // count in NOPTS.
  for (int i = 2; i < nargin; i++)
    {
      std::string str = args(i).string_value();
      if (error_state)
        {
          error ("%s: optional arguments must be strings", nm.c_str());
          break;
        }
      std::transform (str.begin (), str.end (), str.begin (), tolower);
      if (str.find("once", 0) == 0)
        {
          once = true;
          nopts--;
        }
      else if (str.find("matchcase", 0) == 0)
        {
          case_insensitive = false;
          nopts--;
        }
      else if (str.find("ignorecase", 0) == 0)
        {
          case_insensitive = true;
          nopts--;
        }
      else if (str.find("dotall", 0) == 0)
        {
          dotexceptnewline = false;
          nopts--;
        }
      else if (str.find("stringanchors", 0) == 0)
        {
          lineanchors = false;
          nopts--;
        }
      else if (str.find("literalspacing", 0) == 0)
        {
          freespacing = false;
          nopts--;
        }
#if HAVE_PCRE
      // Only accept these options with pcre
      else if (str.find("dotexceptnewline", 0) == 0)
        {
          dotexceptnewline = true;
          nopts--;
        }
      else if (str.find("lineanchors", 0) == 0)
        {
          lineanchors = true;
          nopts--;
        }
      else if (str.find("freespacing", 0) == 0)
        {
          freespacing = true;
          nopts--;
        }
      else if (str.find("start", 0) && str.find("end", 0) &&
               str.find("tokenextents", 0) && str.find("match", 0) &&
               str.find("tokens", 0) && str.find("names", 0))
        error ("%s: unrecognized option", nm.c_str());
#else
      else if (str.find("names", 0) == 0 ||
               str.find("dotexceptnewline", 0) == 0 ||
               str.find("lineanchors", 0) == 0 ||
               str.find("freespacing", 0) == 0)
        // Function name first, then the offending option.
        error ("%s: %s not implemented in this version", nm.c_str(), str.c_str());
      else if (str.find("start", 0) && str.find("end", 0) &&
               str.find("tokenextents", 0) && str.find("match", 0) &&
               str.find("tokens", 0))
        error ("%s: unrecognized option", nm.c_str());
#endif
    }

  if (!error_state)
    {
      Cell t;
      std::string m;
      double s, e;

      // named tokens "(?<name>...)" are only treated with PCRE not regex.
#if HAVE_PCRE

      // Largest repetition count used when expanding a lookbehind.
      const size_t max_length = (buffer.length () > MAXLOOKBEHIND ? 
                                 MAXLOOKBEHIND: buffer.length ());
      const size_t pat_len = pattern.length ();

      size_t pos = 0;
      size_t new_pos;
      int nnames = 0;
      int inames = 0;
      std::ostringstream buf;
      Array<int> named_idx;

      // Rewrite the pattern into BUF, translating every "(?..." group
      // that PCRE would not accept as written.
      while ((new_pos = pattern.find ("(?",pos)) != std::string::npos)
        {
          // The two characters following "(?".  '\0' stands in for a
          // position past the end of the pattern, so that a pattern
          // ending in "(?" or "(?<" is diagnosed instead of throwing
          // std::out_of_range.
          const char c2 = (new_pos + 2 < pat_len ? pattern[new_pos + 2] : '\0');
          const char c3 = (new_pos + 3 < pat_len ? pattern[new_pos + 3] : '\0');

          if (c2 == '<' && !(c3 == '=' || c3 == '!'))
            {
              // The syntax of named tokens in pcre is "(?P<name>...)" while
              // we need a syntax "(?<name>...)", so fix that here. Also an 
              // expression like 
              // "(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)" 
              // should be perfectly legal, while pcre does not allow the same
              // named token name on both sides of the alternative. Also fix
              // that here by replacing name tokens by dummy names, and dealing
              // with the dummy names later.

              size_t tmp_pos = pattern.find_first_of ('>',new_pos);

              if (tmp_pos == std::string::npos)
                {
                  error ("syntax error in pattern");
                  break;
                }

              std::string tmp_name = 
                pattern.substr(new_pos+3,tmp_pos-new_pos-3);
              bool found = false;

              // named_idx(k) is the index into NAMED of the user's name
              // for the k-th dummy name.
              for (int i = 0; i < nnames; i++)
                if (named(i) == tmp_name)
                  {
                    named_idx.resize(inames+1);
                    named_idx(inames) = i;
                    found = true;
                    break;
                  }
              if (! found)
                {
                  named_idx.resize(inames+1);
                  named_idx(inames) = nnames;
                  named.append(tmp_name);
                  nnames++;
                }

              if (new_pos - pos > 0)
                buf << pattern.substr(pos,new_pos-pos);
              if (inames < 10)
                buf << "(?P<n00" << inames++;
              else if (inames < 100)
                buf << "(?P<n0" << inames++;
              else
                buf << "(?P<n" << inames++;
              pos = tmp_pos;
            }
          else if (c2 == '<')
            {
              // Find lookbehind operators of arbitrary length (ie like 
              // "(?<=[a-z]*)") and replace with a maximum length operator 
              // as PCRE can not yet handle arbitrary length lookbehind 
              // operators. The maximum length is the smaller of
              // MAXLOOKBEHIND and the length of the string.

              int brackets = 1;
              size_t tmp_pos1 = new_pos + 2;
              size_t tmp_pos2 = tmp_pos1;

              // Scan for the ")" closing the lookbehind.  The bound is
              // strict so that an unbalanced group ends the scan with
              // brackets != 0 instead of indexing past the end.
              while (tmp_pos1 < pat_len && brackets > 0)
                {
                  char ch = pattern[tmp_pos1];
                  if (ch == '(')
                    brackets++;
                  else if (ch == ')')
                    {
                      if (brackets > 1)
                        tmp_pos2 = tmp_pos1;

                      brackets--;
                    }
                  tmp_pos1++;
                }

              if (brackets != 0)
                {
                  // Unbalanced: copy as is and let pcre_compile complain.
                  buf << pattern.substr (pos, new_pos - pos) << "(?";
                  pos = new_pos + 2;
                }
              else
                {
                  size_t tmp_pos3 = pattern.find_first_of ("*+", tmp_pos2);
                  if (tmp_pos3 != std::string::npos && tmp_pos3 < tmp_pos1)
                    {
                      if (!lookbehind_warned)
                        {
                          lookbehind_warned = true;
                          warning ("%s: arbitrary length lookbehind patterns are only support up to length %d", nm.c_str(), MAXLOOKBEHIND);
                        }

                      buf << pattern.substr (pos, new_pos - pos) << "(";

                      // "*" allows zero repetitions, "+" needs at least one.
                      size_t i;
                      if (pattern.at (tmp_pos3) == '*')
                        i = 0;
                      else
                        i = 1;

                      // Emit one fixed-length lookbehind per repetition
                      // count, joined by "|".
                      for (; i < max_length + 1; i++)
                        {
                          buf << pattern.substr(new_pos, tmp_pos3 - new_pos)
                              << "{" << i << "}";
                          buf << pattern.substr(tmp_pos3 + 1, 
                                                tmp_pos1 - tmp_pos3 - 1);
                          if (i != max_length)
                            buf << "|";
                        }
                      buf << ")";
                    }
                  else
                    buf << pattern.substr (pos, tmp_pos1 - pos);
                  pos = tmp_pos1;
                }
            }
          else
            {
              // Any other "(?" construct is passed through unchanged.
              buf << pattern.substr (pos, new_pos - pos) << "(?";
              pos = new_pos + 2;
            }

        }

      buf << pattern.substr(pos);

      if (error_state)
        return 0;

      // Compile expression
      pcre *re;
      const char *err;
      int erroffset;
      std::string buf_str = buf.str ();
      re = pcre_compile (buf_str.c_str (),
                         (case_insensitive ? PCRE_CASELESS : 0) |
                         (dotexceptnewline ? 0 : PCRE_DOTALL) |
                         (lineanchors ? PCRE_MULTILINE : 0) |
                         (freespacing ? PCRE_EXTENDED : 0),
                         &err, &erroffset, 0);
    
      if (re == 0)
        {
          error("%s: %s at position %d of expression", nm.c_str(), 
                err, erroffset);
          return 0;
        }

      int subpatterns;
      int namecount;
      int nameentrysize;
      char *nametable;
      int idx = 0;

      pcre_fullinfo(re, 0, PCRE_INFO_CAPTURECOUNT,  &subpatterns);
      pcre_fullinfo(re, 0, PCRE_INFO_NAMECOUNT, &namecount);
      pcre_fullinfo(re, 0, PCRE_INFO_NAMEENTRYSIZE, &nameentrysize);
      pcre_fullinfo(re, 0, PCRE_INFO_NAMETABLE, &nametable);

      OCTAVE_LOCAL_BUFFER(int, ovector, (subpatterns+1)*3);
      OCTAVE_LOCAL_BUFFER(int, nidx, namecount);

      for (int i = 0; i < namecount; i++)
        {
          // Index of subpattern in first two bytes MSB first of name.
          // Extract index.  Go through unsigned char so that a byte
          // value above 127 is not sign extended.
          nidx[i] = 
            (static_cast<int>(static_cast<unsigned char>(nametable[i*nameentrysize])) << 8) |
            static_cast<int>(static_cast<unsigned char>(nametable[i*nameentrysize+1]));
        }

      // Search repeatedly, restarting after the end of the previous match.
      while(true)
        {
          OCTAVE_QUIT;

          int matches = pcre_exec(re, 0, buffer.c_str(), 
                                  buffer.length(), idx, 
                                  (idx ? PCRE_NOTBOL : 0),
                                  ovector, (subpatterns+1)*3);

          if (matches == PCRE_ERROR_MATCHLIMIT)
            {
              // try harder; start with default value for MATCH_LIMIT and increase it
              warning("Your pattern caused PCRE to hit its MATCH_LIMIT.\nTrying harder now, but this will be slow.");
              pcre_extra pe;
              pcre_config(PCRE_CONFIG_MATCH_LIMIT, static_cast <void *> (&pe.match_limit));
              pe.flags = PCRE_EXTRA_MATCH_LIMIT;

              int i = 0;
              while (matches == PCRE_ERROR_MATCHLIMIT &&
                     i++ < PCRE_MATCHLIMIT_MAX)
                {
                  OCTAVE_QUIT;

                  pe.match_limit *= 10;
                  matches = pcre_exec(re, &pe, buffer.c_str(), 
                                      buffer.length(), idx, 
                                      (idx ? PCRE_NOTBOL : 0),
                                      ovector, (subpatterns+1)*3);
                }
            }

          if (matches < 0 && matches != PCRE_ERROR_NOMATCH)
            {
              error ("%s: internal error calling pcre_exec\nError code from pcre_exec is %i", nm.c_str(), matches);
              pcre_free(re);
              return 0;
            }
          else if (matches == PCRE_ERROR_NOMATCH)
            break;
          else if (ovector[1] <= ovector[0])
            {
              // Zero sized match: it is not recorded, the search just
              // moves on by one character.  Stop once the end of the
              // string is reached; a start offset beyond the subject
              // would make pcre_exec fail, and no non-empty match can
              // start at the very end.
              idx = ovector[0] + 1;
              if (static_cast<size_t> (idx) < buffer.length ())
                continue;
              else
                break;
            }
          else
            {
              // Token extents, skipping tokens that did not take part
              // in the match.
              int pos_match = 0;
              Matrix te(matches-1,2);
              for (int i = 1; i < matches; i++)
                {
                  if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
                    {
                      te(pos_match,0) = double (ovector[2*i]+1);
                      te(pos_match++,1) = double (ovector[2*i+1]);
                    }
                }
              te.resize(pos_match,2);
              s = double (ovector[0]+1);
              e = double (ovector[1]);

              const char **listptr;
              int status = pcre_get_substring_list(buffer.c_str(), ovector, 
                                                   matches, &listptr);

              if (status == PCRE_ERROR_NOMEMORY)
                {
                  error("%s: cannot allocate memory in pcre_get_substring_list",
                        nm.c_str());
                  pcre_free(re);
                  return 0;
                }

              Cell cell_t (dim_vector(1,pos_match));
              pos_match = 0;
              for (int i = 1; i < matches; i++)
                if (ovector[2*i] >= 0 && ovector[2*i+1] > 0)
                  cell_t(pos_match++) = std::string(*(listptr+i));

              m =  std::string(*listptr);
              t = cell_t;

              // Map the dummy names back to the user's names.  Entry j
              // of the name table belongs to capture group nidx[j], so
              // the loop runs over name table entries rather than over
              // capture groups; the two differ as soon as the pattern
              // mixes named and unnamed groups.
              // NOTE(review): this relies on the name table listing the
              // dummy names in the order they were generated.
              string_vector named_tokens(nnames);
              for (int j = 0; j < namecount && j < inames; j++)
                {
                  const int grp = nidx[j];

                  if (grp > 0 && grp < matches
                      && ovector[2*grp] >= 0 && ovector[2*grp+1] > 0)
                    named_tokens(named_idx(j)) = 
                      std::string(*(listptr+grp));
                }

              pcre_free_substring_list(listptr);

              regexp_elem new_elem (named_tokens, t, m, te, s, e);
              lst.push_back (new_elem);
              idx = ovector[1];
              sz++;

              if (once)
                break;

            }
        }

      pcre_free(re);
#else
      regex_t compiled;
      int err=regcomp(&compiled, pattern.c_str(), REG_EXTENDED | 
                      (case_insensitive ? REG_ICASE : 0));
      if (err)
        {
          // Ask for the required buffer size first, then fetch the text.
          int len = regerror(err, &compiled, 0, 0);
          OCTAVE_LOCAL_BUFFER (char, errmsg, len);
          regerror(err, &compiled, errmsg, len);
          error("%s: %s in pattern (%s)", nm.c_str(), errmsg, 
                pattern.c_str());
          regfree(&compiled);
          return 0;
        }

      // One slot for the whole match plus one per "(" in the pattern.
      int subexpr = 1;
      int idx = 0;
      for (unsigned int i=0; i < pattern.length(); i++)
        subexpr += ( pattern[i] == '(' ? 1 : 0 );
      OCTAVE_LOCAL_BUFFER (regmatch_t, match, subexpr );

      while(true)
        {
          OCTAVE_QUIT; 

          if (regexec(&compiled, buffer.c_str() + idx, subexpr, 
                      match, (idx ? REG_NOTBOL : 0)) == 0) 
            {
              // Count actual matches
              int matches = 0;
              while (matches < subexpr && match[matches].rm_so >= 0) 
                matches++;

              if (matches == 0 || match[0].rm_eo == 0)
                break;

              // Offsets from regexec are relative to buffer + idx.
              s = double (match[0].rm_so+1+idx);
              e = double (match[0].rm_eo+idx);
              Matrix te(matches-1,2);
              for (int i = 1; i < matches; i++)
                {
                  te(i-1,0) = double (match[i].rm_so+1+idx);
                  te(i-1,1) = double (match[i].rm_eo+idx);
                }

              m =  buffer.substr (match[0].rm_so+idx, 
                                  match[0].rm_eo-match[0].rm_so);

              Cell cell_t (dim_vector(1,matches-1));
              for (int i = 1; i < matches; i++)
                cell_t(i-1) = buffer.substr (match[i].rm_so+idx, 
                                             match[i].rm_eo-match[i].rm_so);
              t = cell_t;

              idx += match[0].rm_eo;

              string_vector sv;
              regexp_elem new_elem (sv, t, m, te, s, e);
              lst.push_back (new_elem);
              sz++;

              if (once)
                break;
            }
          else
            break;
        }
      regfree(&compiled);
#endif
    }
#else
  error ("%s: not available in this version of Octave", nm.c_str());
#endif
  return sz;
}

// Match the pattern args(1) against the string args(0) and package the
// result as the list of output values of the calling builtin.
//
// Without output selecting options the list holds, in this order: start
// indices, end indices, token extents, matched text, token text and the
// structure of named tokens.  If such options are present, the outputs
// they name come first, in the order given, followed by the remaining
// ones.  With the "once" option each output describes the first match
// only instead of collecting all matches.
//
// NM is the name of the calling function and is used in messages.  An
// empty list is returned if an error occurred.
static octave_value_list
octregexp (const octave_value_list &args, int nargout, const std::string &nm,
           bool case_insensitive)
{
  octave_value_list retval;

  std::list<regexp_elem> lst;
  string_vector named;
  int nopts;
  bool once;
  const int sz = octregexp_list (args, nm, case_insensitive, lst, named,
                                 nopts, once);

  if (error_state)
    return retval;

  const int nargin = args.length ();

  // Output 5: named tokens.  A single match stores the plain text of
  // each token, otherwise one cell array per name is stored.
#ifdef HAVE_PCRE
  Octave_map nmap;
  for (int j = 0; j < named.length (); j++)
    {
      if (sz == 1)
        nmap.assign (named(j), lst.begin()->named_token(j));
      else
        {
          Cell tmp (dim_vector (1, sz));
          octave_idx_type pos = 0;
          for (const_iterator p = lst.begin (); p != lst.end (); p++)
            tmp(pos++) = p->named_token(j);
          nmap.assign (named(j), octave_value (tmp));
        }
    }
  retval(5) = nmap;
#else
  retval(5) = Octave_map();
#endif

  if (once)
    {
      // Outputs 4 .. 0 taken from the first match, or empty values if
      // nothing matched.
      retval(4) = sz ? lst.front ().t : Cell();
      retval(3) = sz ? lst.front ().m : std::string();
      retval(2) = sz ? lst.front ().te : Matrix();

      if (sz)
        {
          retval(1) = lst.front ().e;
          retval(0) = lst.front ().s;
        }
      else
        {
          retval(1) = Matrix();
          retval(0) = Matrix();
        }
    }
  else
    {
      // Outputs 4 .. 0 with one entry per match, filled in one pass
      // over the list.
      Cell tokens (dim_vector (1, sz));
      Cell matched (dim_vector (1, sz));
      Cell extents (dim_vector (1, sz));
      NDArray ends (dim_vector (1, sz));
      NDArray starts (dim_vector (1, sz));

      octave_idx_type pos = 0;
      for (const_iterator p = lst.begin (); p != lst.end (); p++, pos++)
        {
          tokens(pos) = p->t;
          matched(pos) = p->m;
          extents(pos) = p->te;
          ends(pos) = p->e;
          starts(pos) = p->s;
        }

      retval(4) = tokens;
      retval(3) = matched;
      retval(2) = extents;
      retval(1) = ends;
      retval(0) = starts;
    }

  // Alter the order of the output arguments
  if (nopts > 0)
    {
      // Options that change the matching and do not select an output.
      static const char *const switches[] =
        {
          "once", "stringanchors", "lineanchors", "matchcase",
          "ignorecase", "dotall", "dotexceptnewline", "literalspacing",
          "freespacing"
        };
      static const int n_switches = 9;

      // Options that select an output; the position in this table is
      // the position of that output in retval.
      static const char *const selectors[] =
        {
          "start", "end", "tokenextents", "match", "tokens", "names"
        };
      static const int n_selectors = 6;

      octave_value_list new_retval;
      new_retval.resize(nargout);

      bool arg_used[6] = { false, false, false, false, false, false };
      int n = 0;

      for (int j = 2; j < nargin; j++)
        {
          std::string opt = args(j).string_value();
          std::transform (opt.begin (), opt.end (), opt.begin (), tolower);

          bool is_switch = false;
          for (int q = 0; q < n_switches && ! is_switch; q++)
            is_switch = (opt.find (switches[q], 0) == 0);

          if (is_switch)
            continue;

          // The first selector that OPT starts with wins; output 0 is
          // used if none does.
          int k = 0;
          for (int q = 0; q < n_selectors; q++)
            if (opt.find (selectors[q], 0) == 0)
              {
                k = q;
                break;
              }

          new_retval(n++) = retval(k);
          arg_used[k] = true;

          if (n == nargout)
            break;
        }

      // Fill in the rest of the arguments
      if (n < nargout)
        for (int j = 0; j < 6; j++)
          if (! arg_used[j])
            new_retval(n++) = retval(j);

      retval = new_retval;
    }

  return retval;
}

// Apply octregexp when the string args(0), the pattern args(1) or both
// are cell arrays.
//
// The following combinations are accepted:
//   - only one of the two is a cell array: every element of it is
//     combined with the other argument;
//   - both are cell arrays and one has exactly one element: that element
//     is combined with every element of the other;
//   - both are cell arrays with the same dimensions: they are combined
//     element by element.
// Anything else is an error.  If neither argument is a cell array the
// call is simply forwarded to octregexp.
//
// Each output is a cell array with the dimensions of the cell array
// that is iterated over, holding the corresponding output of octregexp
// for every element.  A call with NARGOUT equal to zero is treated as a
// request for one output, so that a result is produced as for the
// non-cell case.  NM is the name of the calling function and is used in
// error messages.  An empty list is returned if an error occurred.
static octave_value_list
octcellregexp (const octave_value_list &args, int nargout, const std::string &nm,
               bool case_insensitive)
{
  octave_value_list retval;

  const bool str_is_cell = args(0).is_cell();
  const bool pat_is_cell = args(1).is_cell();

  if (! str_is_cell && ! pat_is_cell)
    return octregexp (args, nargout, nm, case_insensitive);

  // Always compute at least the first output.
  const int nout = (nargout > 0 ? nargout : 1);

  octave_value_list new_args = args;
  Cell cellstr;
  Cell cellpat;

  if (str_is_cell)
    cellstr = args(0).cell_value();
  if (pat_is_cell)
    cellpat = args(1).cell_value();

  // Which of the two arguments changes from one call to the next.
  bool iter_str = false;
  bool iter_pat = false;

  if (str_is_cell && pat_is_cell)
    {
      if (cellpat.numel() == 1)
        {
          new_args(1) = cellpat(0);
          iter_str = true;
        }
      else if (cellstr.numel() == 1)
        {
          new_args(0) = cellstr(0);
          iter_pat = true;
        }
      else if (cellstr.numel() == cellpat.numel())
        {
          if (cellstr.dims() != cellpat.dims())
            {
              error ("%s: Inconsistent cell array dimensions", nm.c_str());
              return retval;
            }

          iter_str = true;
          iter_pat = true;
        }
      else
        {
          // Report under the caller's name rather than a fixed one.
          error ("%s: cell array arguments must be scalar or equal size",
                 nm.c_str());
          return retval;
        }
    }
  else if (str_is_cell)
    iter_str = true;
  else
    iter_pat = true;

  // The cell array that is iterated over fixes the shape of the outputs.
  const Cell& shape = (iter_str ? cellstr : cellpat);

  OCTAVE_LOCAL_BUFFER (Cell, newretval, nout);

  for (int j = 0; j < nout; j++)
    newretval[j].resize(shape.dims());

  for (octave_idx_type i = 0; i < shape.numel (); i++)
    {
      if (iter_str)
        new_args(0) = cellstr(i);
      if (iter_pat)
        new_args(1) = cellpat(i);

      octave_value_list tmp = octregexp (new_args, nout, nm, 
                                         case_insensitive);

      if (error_state)
        break;

      for (int j = 0; j < nout; j++)
        newretval[j](i) = tmp(j);
    }

  if (!error_state)
    for (int j = 0; j < nout; j++)
      retval(j) = octave_value (newretval[j]);

  return retval;
}

DEFUN_DLD (regexp, args, nargout,
  "-*- texinfo -*-\n\
@deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexp (@var{str}, @var{pat})\n\
@deftypefnx {Loadable Function} {[@dots{}] =} regexp (@var{str}, @var{pat}, @var{opts}, @dots{})\n\
\n\
Regular expression string matching. Matches @var{pat} in @var{str} and\n\
returns the position and matching substrings or empty values if there are\n\
none.\n\
\n\
The matched pattern @var{pat} can include any of the standard regex\n\
operators, including:\n\
\n\
@table @code\n\
@item .\n\
Match any character\n\
@item * + ? @{@}\n\
Repetition operators, representing\n\
@table @code\n\
@item *\n\
Match zero or more times\n\
@item +\n\
Match one or more times\n\
@item ?\n\
Match zero or one times\n\
@item @{@}\n\
Match range operator, which is of the form @code{@{@var{n}@}} to match exactly\n\
@var{n} times, @code{@{@var{m},@}} to match @var{m} or more times,\n\
@code{@{@var{m},@var{n}@}} to match between @var{m} and @var{n} times.\n\
@end table\n\
@item [@dots{}] [^@dots{}]\n\
List operators, where for example @code{[ab]c} matches @code{ac} and @code{bc}\n\
@item ()\n\
Grouping operator\n\
@item |\n\
Alternation operator. Match one of a choice of regular expressions. The\n\
alternatives must be delimited by the grouping operator @code{()} above\n\
@item ^ $\n\
Anchoring operator. @code{^} matches the start of the string @var{str} and\n\
@code{$} the end\n\
@end table\n\
\n\
In addition the following escaped characters have special meaning. It should\n\
be noted that it is recommended to quote @var{pat} in single quotes rather\n\
than double quotes, to avoid the escape sequences being interpreted by Octave\n\
before being passed to @code{regexp}.\n\
\n\
@table @code\n\
@item \\b\n\
Match a word boundary\n\
@item \\B\n\
Match within a word\n\
@item \\w\n\
Matches any word character\n\
@item \\W\n\
Matches any non word character\n\
@item \\<\n\
Matches the beginning of a word\n\
@item \\>\n\
Matches the end of a word\n\
@item \\s\n\
Matches any whitespace character\n\
@item \\S\n\
Matches any non whitespace character\n\
@item \\d\n\
Matches any digit\n\
@item \\D\n\
Matches any non-digit\n\
@end table\n\
\n\
The outputs of @code{regexp} by default are in the order as given below\n\
\n\
@table @asis\n\
@item @var{s}\n\
The start indices of each of the matching substrings\n\
\n\
@item @var{e}\n\
The end indices of each matching substring\n\
\n\
@item @var{te}\n\
The extents of each of the matched token surrounded by @code{(@dots{})} in\n\
@var{pat}.\n\
\n\
@item @var{m}\n\
A cell array of the text of each match.\n\
\n\
@item @var{t}\n\
A cell array of the text of each token matched.\n\
\n\
@item @var{nm}\n\
A structure containing the text of each matched named token, with the name\n\
being used as the fieldname. A named token is denoted as\n\
@code{(?<name>@dots{})}\n\
@end table\n\
\n\
Particular output arguments or the order of the output arguments can be\n\
selected by additional @var{opts} arguments. These are strings and the\n\
correspondence between the output arguments and the optional argument\n\
are\n\
\n\
@multitable @columnfractions 0.2 0.3 0.3 0.2\n\
@item @tab 'start'        @tab @var{s}  @tab\n\
@item @tab 'end'          @tab @var{e}  @tab\n\
@item @tab 'tokenExtents' @tab @var{te} @tab\n\
@item @tab 'match'        @tab @var{m}  @tab\n\
@item @tab 'tokens'       @tab @var{t}  @tab\n\
@item @tab 'names'        @tab @var{nm}  @tab\n\
@end multitable\n\
\n\
A further optional argument is 'once', that limits the number of returned\n\
matches to the first match. Additional arguments are\n\
\n\
@table @asis\n\
@item matchcase\n\
Make the matching case sensitive.\n\
@item ignorecase\n\
Make the matching case insensitive.\n\
@item stringanchors\n\
Match the anchor characters at the beginning and end of the string.\n\
@item lineanchors\n\
Match the anchor characters at the beginning and end of the line.\n\
@item dotall\n\
The character @code{.} matches the newline character.\n\
@item dotexceptnewline\n\
The character @code{.} matches all but the newline character.\n\
@item freespacing\n\
The pattern can include arbitrary whitespace and comments starting with\n\
@code{#}.\n\
@item literalspacing\n\
The pattern is taken literally.\n\
@end table\n\
@seealso{regexpi, regexprep}\n\
@end deftypefn")
{
  octave_value_list retval;

  // The string and the pattern are both required.
  if (args.length () < 2)
    {
      print_usage ();
      return retval;
    }

  // Cell array arguments take the cell-aware path; plain arguments go
  // to the matcher directly.  Matching is case sensitive by default.
  const bool any_cell = args(0).is_cell () || args(1).is_cell ();

  retval = (any_cell
            ? octcellregexp (args, nargout, "regexp", false)
            : octregexp (args, nargout, "regexp", false));

  return retval;
}

/*

## PCRE_ERROR_MATCHLIMIT test
%!test
%! s=sprintf('\t4\n0000\t-0.00\t-0.0000\t4\t-0.00\t-0.0000\t4\n0000\t-0.00\t-0.0000\t0\t-0.00\t-');
%! ws = warning("query");
%! unwind_protect
%!   warning("off");
%!   regexp(s, '(\s*-*\d+[.]*\d*\s*)+\n');
%! unwind_protect_cleanup
%!   warning(ws);
%! end_unwind_protect

## seg-fault test
%!assert(regexp("abcde","."),[1,2,3,4,5])

## Check that anchoring of pattern works correctly
%!assert(regexp('abcabc','^abc'),1);
%!assert(regexp('abcabc','abc$'),4);
%!assert(regexp('abcabc','^abc$'),zeros(1,0));

%!test
%! [s, e, te, m, t] = regexp(' No Match ', 'f(.*)uck');
%! assert (s,zeros(1,0))
%! assert (e,zeros(1,0))
%! assert (te,cell(1,0))
%! assert (m, cell(1,0))
%! assert (t, cell(1,0))

%!test
%! [s, e, te, m, t] = regexp(' FiRetrUck ', 'f(.*)uck');
%! assert (s,zeros(1,0))
%! assert (e,zeros(1,0))
%! assert (te,cell(1,0))
%! assert (m, cell(1,0))
%! assert (t, cell(1,0))

%!test
%! [s, e, te, m, t] = regexp(' firetruck ', 'f(.*)uck');
%! assert (s,2)
%! assert (e,10)
%! assert (te{1},[3,7])
%! assert (m{1}, 'firetruck')
%! assert (t{1}{1}, 'iretr')

%!test
%! [s, e, te, m, t] = regexp('short test string','\w*r\w*');
%! assert (s,[1,12])
%! assert (e,[5,17])
%! assert (size(te), [1,2])
%! assert (isempty(te{1}))
%! assert (isempty(te{2}))
%! assert (m{1},'short')
%! assert (m{2},'string')
%! assert (size(t), [1,2])
%! assert (isempty(t{1}))
%! assert (isempty(t{2}))

%!test
%! [s, e, te, m, t] = regexp('short test string','\w*r\w*','once');
%! assert (s,1)
%! assert (e,5)
%! assert (isempty(te))
%! assert (m,'short')
%! assert (isempty(t))

%!test
%! [m, te, e, s, t] = regexp('short test string','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
%! assert (s,1)
%! assert (e,5)
%! assert (isempty(te))
%! assert (m,'short')
%! assert (isempty(t))

%!testif HAVE_PCRE
%! ## This test is expected to fail if PCRE is not installed
%! [s, e, te, m, t, nm] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)');
%! assert (s,1)
%! assert (e,10)
%! assert (size(te), [1,1])
%! assert (te{1}, [1 5; 7, 10])
%! assert (m{1},'short test')
%! assert (size(t),[1,1])
%! assert (t{1}{1},'short')
%! assert (t{1}{2},'test')
%! assert (size(nm), [1,1])
%! assert (!isempty(fieldnames(nm)))
%! assert (sort(fieldnames(nm)),{'word1';'word2'})
%! assert (nm.word1,'short')
%! assert (nm.word2,'test')

%!testif HAVE_PCRE
%! ## This test is expected to fail if PCRE is not installed
%! [nm, m, te, e, s, t] = regexp('short test string','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
%! assert (s,1)
%! assert (e,10)
%! assert (size(te), [1,1])
%! assert (te{1}, [1 5; 7, 10])
%! assert (m{1},'short test')
%! assert (size(t),[1,1])
%! assert (t{1}{1},'short')
%! assert (t{1}{2},'test')
%! assert (size(nm), [1,1])
%! assert (!isempty(fieldnames(nm)))
%! assert (sort(fieldnames(nm)),{'word1';'word2'})
%! assert (nm.word1,'short')
%! assert (nm.word2,'test')

%!testif HAVE_PCRE
%! ## This test is expected to fail if PCRE is not installed
%! [t, nm] = regexp("John Davis\nRogers, James",'(?<first>\w+)\s+(?<last>\w+)|(?<last>\w+),\s+(?<first>\w+)','tokens','names');
%! assert (size(t), [1,2]);
%! assert (t{1}{1},'John');
%! assert (t{1}{2},'Davis');
%! assert (t{2}{1},'Rogers');
%! assert (t{2}{2},'James');
%! assert (size(nm), [1,1]);
%! assert (nm.first{1},'John');
%! assert (nm.first{2},'James');
%! assert (nm.last{1},'Davis');
%! assert (nm.last{2},'Rogers');

%!assert(regexp("abc\nabc",'.'),[1:7])
%!assert(regexp("abc\nabc",'.','dotall'),[1:7])
%!testif HAVE_PCRE
%! assert(regexp("abc\nabc",'(?s).'),[1:7])
%! assert(regexp("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7])
%! assert(regexp("abc\nabc",'(?-s).'),[1,2,3,5,6,7])

%!assert(regexp("caseCaSe",'case'),1)
%!assert(regexp("caseCaSe",'case',"matchcase"),1)
%!assert(regexp("caseCaSe",'case',"ignorecase"),[1,5])
%!testif HAVE_PCRE
%! assert(regexp("caseCaSe",'(?-i)case'),1)
%! assert(regexp("caseCaSe",'(?i)case'),[1,5])

%!assert (regexp("abc\nabc",'c$'),7)
%!assert (regexp("abc\nabc",'c$',"stringanchors"),7)
%!testif HAVE_PCRE
%! assert (regexp("abc\nabc",'(?-m)c$'),7)
%! assert (regexp("abc\nabc",'c$',"lineanchors"),[3,7])
%! assert (regexp("abc\nabc",'(?m)c$'),[3,7])

%!assert (regexp("this word",'s w'),4)
%!assert (regexp("this word",'s w','literalspacing'),4)
%!testif HAVE_PCRE
%! assert (regexp("this word",'(?-x)s w','literalspacing'),4)
%! assert (regexp("this word",'s w','freespacing'),zeros(1,0))
%! assert (regexp("this word",'(?x)s w'),zeros(1,0))

%!error regexp('string', 'tri', 'BadArg');
%!error regexp('string');

%!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},'-'),{6;[1,5,9];zeros(1,0)})
%!assert(regexp({'asdfg-dfd','-dfd-dfd-','qasfdfdaq'},'-'),{6,[1,5,9],zeros(1,0)})
%!assert(regexp({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]})
%!assert(regexp('Strings',{'t','s'}),{2,7})

## Test case for lookaround operators
%!assert(regexp('Iraq','q(?!u)'),4)
%!assert(regexp('quit','q(?!u)'), zeros(1,0))
%!assert(regexp('quit','q(?=u)','match'), {'q'})
%!assert(regexp("quit",'q(?=u+)','match'), {'q'})
%!assert(regexp("qit",'q(?=u+)','match'), cell(1,0))
%!assert(regexp("qit",'q(?=u*)','match'), {'q'})

%!assert(regexp('thingamabob','(?<=a)b'), 9)

*/

DEFUN_DLD (regexpi, args, nargout,
  "-*- texinfo -*-\n\
@deftypefn {Loadable Function} {[@var{s}, @var{e}, @var{te}, @var{m}, @var{t}, @var{nm}] =} regexpi (@var{str}, @var{pat})\n\
@deftypefnx {Loadable Function} {[@dots{}] =} regexpi (@var{str}, @var{pat}, @var{opts}, @dots{})\n\
\n\
Case insensitive regular expression string matching. Matches @var{pat} in\n\
@var{str} and returns the position and matching substrings or empty values\n\
if there are none.  @xref{doc-regexp,,regexp}, for more details\n\
@end deftypefn")
{
  // Entry point for regexpi: validates the argument count and forwards
  // to the cell-array or plain-string worker.  Both workers receive the
  // name "regexpi" and a trailing `true' flag (regexp passes `false' in
  // the same position -- presumably the case-insensitivity switch).
  octave_value_list result;

  if (args.length () >= 2)
    {
      // A cell array in either the string or the pattern position
      // selects the element-wise worker.
      const bool cell_input = args(0).is_cell () || args(1).is_cell ();

      if (cell_input)
        result = octcellregexp (args, nargout, "regexpi", true);
      else
        result = octregexp (args, nargout, "regexpi", true);
    }
  else
    {
      // At least a string and a pattern are required; the (empty)
      // result list is returned after reporting usage.
      print_usage ();
    }

  return result;
}

/*

## seg-fault test
%!assert(regexpi("abcde","."),[1,2,3,4,5])

## Check that anchoring of pattern works correctly
%!assert(regexpi('abcabc','^abc'),1);
%!assert(regexpi('abcabc','abc$'),4);
%!assert(regexpi('abcabc','^abc$'),zeros(1,0));

%!test
%! [s, e, te, m, t] = regexpi(' No Match ', 'f(.*)uck');
%! assert (s,zeros(1,0))
%! assert (e,zeros(1,0))
%! assert (te,cell(1,0))
%! assert (m, cell(1,0))
%! assert (t, cell(1,0))

%!test
%! [s, e, te, m, t] = regexpi(' FiRetrUck ', 'f(.*)uck');
%! assert (s,2)
%! assert (e,10)
%! assert (te{1},[3,7])
%! assert (m{1}, 'FiRetrUck')
%! assert (t{1}{1}, 'iRetr')

%!test
%! [s, e, te, m, t] = regexpi(' firetruck ', 'f(.*)uck');
%! assert (s,2)
%! assert (e,10)
%! assert (te{1},[3,7])
%! assert (m{1}, 'firetruck')
%! assert (t{1}{1}, 'iretr')

%!test
%! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*');
%! assert (s,[1,12])
%! assert (e,[5,17])
%! assert (size(te), [1,2])
%! assert (isempty(te{1}))
%! assert (isempty(te{2}))
%! assert (m{1},'ShoRt')
%! assert (m{2},'String')
%! assert (size(t), [1,2])
%! assert (isempty(t{1}))
%! assert (isempty(t{2}))

%!test
%! [s, e, te, m, t] = regexpi('ShoRt Test String','\w*r\w*','once');
%! assert (s,1)
%! assert (e,5)
%! assert (isempty(te))
%! assert (m,'ShoRt')
%! assert (isempty(t))

%!test
%! [m, te, e, s, t] = regexpi('ShoRt Test String','\w*r\w*','once', 'match', 'tokenExtents', 'end', 'start', 'tokens');
%! assert (s,1)
%! assert (e,5)
%! assert (isempty(te))
%! assert (m,'ShoRt')
%! assert (isempty(t))

%!testif HAVE_PCRE
%! ## This test is expected to fail if PCRE is not installed
%! [s, e, te, m, t, nm] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)');
%! assert (s,1)
%! assert (e,10)
%! assert (size(te), [1,1])
%! assert (te{1}, [1 5; 7, 10])
%! assert (m{1},'ShoRt Test')
%! assert (size(t),[1,1])
%! assert (t{1}{1},'ShoRt')
%! assert (t{1}{2},'Test')
%! assert (size(nm), [1,1])
%! assert (!isempty(fieldnames(nm)))
%! assert (sort(fieldnames(nm)),{'word1';'word2'})
%! assert (nm.word1,'ShoRt')
%! assert (nm.word2,'Test')

%!testif HAVE_PCRE
%! ## This test is expected to fail if PCRE is not installed
%! [nm, m, te, e, s, t] = regexpi('ShoRt Test String','(?<word1>\w*t)\s*(?<word2>\w*t)', 'names', 'match', 'tokenExtents', 'end', 'start', 'tokens');
%! assert (s,1)
%! assert (e,10)
%! assert (size(te), [1,1])
%! assert (te{1}, [1 5; 7, 10])
%! assert (m{1},'ShoRt Test')
%! assert (size(t),[1,1])
%! assert (t{1}{1},'ShoRt')
%! assert (t{1}{2},'Test')
%! assert (size(nm), [1,1])
%! assert (!isempty(fieldnames(nm)))
%! assert (sort(fieldnames(nm)),{'word1';'word2'})
%! assert (nm.word1,'ShoRt')
%! assert (nm.word2,'Test')

%!assert(regexpi("abc\nabc",'.'),[1:7])
%!assert(regexpi("abc\nabc",'.','dotall'),[1:7])
%!testif HAVE_PCRE
%! assert(regexpi("abc\nabc",'(?s).'),[1:7])
%! assert(regexpi("abc\nabc",'.','dotexceptnewline'),[1,2,3,5,6,7])
%! assert(regexpi("abc\nabc",'(?-s).'),[1,2,3,5,6,7])

%!assert(regexpi("caseCaSe",'case'),[1,5])
%!assert(regexpi("caseCaSe",'case',"matchcase"),1)
%!assert(regexpi("caseCaSe",'case',"ignorecase"),[1,5])
%!testif HAVE_PCRE
%! assert(regexpi("caseCaSe",'(?-i)case'),1)
%! assert(regexpi("caseCaSe",'(?i)case'),[1,5])

%!assert (regexpi("abc\nabc",'c$'),7)
%!assert (regexpi("abc\nabc",'c$',"stringanchors"),7)
%!testif HAVE_PCRE
%! assert (regexpi("abc\nabc",'(?-m)c$'),7)
%! assert (regexpi("abc\nabc",'c$',"lineanchors"),[3,7])
%! assert (regexpi("abc\nabc",'(?m)c$'),[3,7])

%!assert (regexpi("this word",'s w'),4)
%!assert (regexpi("this word",'s w','literalspacing'),4)
%!testif HAVE_PCRE
%! assert (regexpi("this word",'(?-x)s w','literalspacing'),4)
%! assert (regexpi("this word",'s w','freespacing'),zeros(1,0))
%! assert (regexpi("this word",'(?x)s w'),zeros(1,0))

%!error regexpi('string', 'tri', 'BadArg');
%!error regexpi('string');

%!assert(regexpi({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},'-'),{6;[1,5,9];zeros(1,0)})
%!assert(regexpi({'asdfg-dfd','-dfd-dfd-','qasfdfdaq'},'-'),{6,[1,5,9],zeros(1,0)})
%!assert(regexpi({'asdfg-dfd';'-dfd-dfd-';'qasfdfdaq'},{'-';'f';'q'}),{6;[3,7];[1,9]})
%!assert(regexpi('Strings',{'t','s'}),{2,[1,7]})

*/


// Return true if REPL contains a "$N" token reference (N being a single
// decimal digit) that starts at index POS.
static bool
octregexprep_is_token (const std::string &repl, size_t pos)
{
  // The cast keeps isdigit well defined for bytes with the high bit set.
  return (pos + 1 < repl.size () && repl[pos] == '$'
          && isdigit (static_cast<unsigned char> (repl[pos+1])));
}

// Worker for regexprep on a single string.
//
// ARGS holds (string, pattern, replacement, options...); NM is the
// function name forwarded to octregexp_list.
//
// Returns the input string with every match of the pattern replaced by
// the replacement text.  In the replacement, "$N" (N = 1..9) stands for
// the text of the Nth token of the current match, "$0" for the whole
// match, and a reference to a token number larger than the number of
// token extents of the match expands to nothing.  When nothing matches,
// the first argument is returned unchanged.  If an error is raised
// (error_state set) an undefined octave_value is returned.
static octave_value
octregexprep (const octave_value_list &args, const std::string &nm)
{
  octave_value retval;
  int nargin = args.length ();

  // Make sure we have string,pattern,replacement
  const std::string buffer = args(0).string_value ();
  if (error_state) return retval;
  const std::string pattern = args(1).string_value ();
  if (error_state) return retval;
  const std::string replacement = args(2).string_value ();
  if (error_state) return retval;

  // Pack options excluding 'tokenize' and various output
  // reordering strings into regexp arg list.
  // NOTE(review): the comparison below is case sensitive, so e.g.
  // "tokenExtents" is forwarded rather than dropped here.
  octave_value_list regexpargs (nargin-1, octave_value ());
  regexpargs(0) = args(0);
  regexpargs(1) = args(1);
  int len = 2;
  for (int i = 3; i < nargin; i++)
    {
      const std::string opt = args(i).string_value ();
      // Options must be strings, just like the three leading arguments.
      if (error_state)
        return retval;

      if (opt != "tokenize" && opt != "start" && opt != "end"
          && opt != "tokenextents" && opt != "match" && opt != "tokens"
          && opt != "names"  && opt != "warnings")
        regexpargs(len++) = args(i);
    }
  regexpargs.resize (len);

  // Identify replacement tokens; build a vector of group numbers in
  // the replacement string so that we can quickly calculate the size
  // of the replacement.
  std::vector<int> token;
  for (size_t i = 0; i < replacement.size (); i++)
    {
      if (octregexprep_is_token (replacement, i))
        {
          token.push_back (replacement[i+1] - '0');
          i++;  // skip the digit as well
        }
    }
  const int tokens = static_cast<int> (token.size ());

  // Collect the matches
  std::list<regexp_elem> lst;
  string_vector named;
  int nopts;
  bool once;
  const int sz = octregexp_list (regexpargs, nm, false, lst, named,
                                 nopts, once);

  if (error_state)
    return retval;

  if (sz == 0)
    {
      // No match: hand back the original argument untouched.
      retval = args(0);
      return retval;
    }

  // Determine replacement length.  Each "$N" occupies two characters of
  // the replacement that are not copied literally.
  const size_t replen = replacement.size () - 2*tokens;
  int delta = 0;
  const_iterator p = lst.begin ();
  for (int i = 0; i < sz; i++)
    {
      OCTAVE_QUIT;

      size_t pairlen = 0;
      if (tokens > 0)
        {
          const Matrix pairs (p->te);
          for (int j = 0; j < tokens; j++)
            {
              if (token[j] == 0)
                pairlen += static_cast<size_t> (p->e - p->s) + 1;
              else if (token[j] <= pairs.rows ())
                pairlen += static_cast<size_t> (pairs(token[j]-1,1) -
                                                pairs(token[j]-1,0)) + 1;
            }
        }

      delta += static_cast<int> (replen + pairlen) -
        static_cast<int> (p->e - p->s + 1);
      p++;
    }

  // Build replacement string
  std::string rep;
  rep.reserve (buffer.size () + delta);
  size_t from = 0;
  p = lst.begin ();
  for (int i = 0; i < sz; i++)
    {
      OCTAVE_QUIT;

      // s and e are 1-based, inclusive positions in buffer.
      const size_t match_start = static_cast<size_t> (p->s - 1);
      const size_t match_len = static_cast<size_t> (p->e - p->s) + 1;

      // Copy the unmatched text preceding this match.
      rep.append (&buffer[from], match_start - from);
      from = static_cast<size_t> (p->e - 1) + 1;

      if (tokens == 0)
        {
          // Nothing to expand, the replacement is copied verbatim.
          rep.append (replacement);
        }
      else
        {
          const Matrix pairs (p->te);
          for (size_t j = 0; j < replacement.size (); j++)
            {
              if (! octregexprep_is_token (replacement, j))
                {
                  rep.append (1, replacement[j]);
                  continue;
                }

              const int k = replacement[j+1] - '0';
              if (k == 0)
                {
                  // replace with entire match, which begins at the
                  // start (not the end) of the matched range
                  rep.append (&buffer[match_start], match_len);
                }
              else if (k <= pairs.rows ())
                {
                  // replace with group capture
                  rep.append (&buffer[static_cast<size_t> (pairs(k-1,0)-1)],
                              static_cast<size_t> (pairs(k-1,1) -
                                                   pairs(k-1,0)) + 1);
                }
              // otherwise: replace with nothing

              j++;  // skip the digit of the token reference
            }
        }

      p++;
    }

  // Copy whatever follows the last match.
  rep.append (&buffer[from], buffer.size () - from);

  retval = rep;
  return retval;
}

DEFUN_DLD (regexprep, args, ,
  "-*- texinfo -*-\n\
@deftypefn {Loadable Function}  {@var{string} =} regexprep (@var{string}, @var{pat}, @var{repstr}, @var{options})\n\
Replace matches of @var{pat} in  @var{string} with @var{repstr}.\n\
\n\
\n\
The replacement can contain @code{$i}, which substitutes\n\
for the ith set of parentheses in the match string.  E.g.,\n\
@example\n\
\n\
   regexprep(\"Bill Dunn\",'(\\w+) (\\w+)','$2, $1')\n\
\n\
@end example\n\
returns \"Dunn, Bill\"\n\
\n\
@var{options} may be zero or more of\n\
@table @samp\n\
\n\
@item once\n\
Replace only the first occurrence of @var{pat} in the result.\n\
\n\
@item warnings\n\
This option is present for compatibility but is ignored.\n\
\n\
@item ignorecase or matchcase\n\
Ignore case for the pattern matching (see @code{regexpi}).\n\
Alternatively, use (?i) or (?-i) in the pattern.\n\
\n\
@item lineanchors and stringanchors\n\
Whether characters ^ and $ match the beginning and ending of lines.\n\
Alternatively, use (?m) or (?-m) in the pattern.\n\
\n\
@item dotexceptnewline and dotall\n\
Whether . matches newlines in the string.\n\
Alternatively, use (?s) or (?-s) in the pattern.\n\
\n\
@item freespacing or literalspacing\n\
Whether whitespace and # comments can be used to make the regular expression more readable.\n\
Alternatively, use (?x) or (?-x) in the pattern.\n\
\n\
@end table\n\
@seealso{regexp,regexpi,strrep}\n\
@end deftypefn")
{
  // Entry point for regexprep.  Plain arguments go straight to
  // octregexprep; if any of string, pattern or replacement is a cell
  // array, every string is processed in turn and each pattern /
  // replacement pair is applied to it one after the other.
  octave_value_list retval;

  if (args.length () < 3)
    {
      print_usage ();
      return retval;
    }

  const bool cell_input
    = args(0).is_cell () || args(1).is_cell () || args(2).is_cell ();

  if (! cell_input)
    {
      retval = octregexprep (args, "regexprep");
      return retval;
    }

  // Bring all three leading arguments into Cell form; a non-cell
  // argument becomes a one-element Cell.
  Cell strings;
  if (args(0).is_cell ())
    strings = args(0).cell_value ();
  else
    strings = Cell (args(0));

  Cell patterns;
  if (args(1).is_cell ())
    patterns = args(1).cell_value ();
  else
    patterns = Cell (args(1));

  Cell replacements;
  if (args(2).is_cell ())
    replacements = args(2).cell_value ();
  else
    replacements = Cell (args(2));

  const bool single_pat = (patterns.numel () == 1);
  const bool single_rep = (replacements.numel () == 1);

  // The result has the shape of the string cell.  The shape of the
  // per-string pass list comes from the patterns unless there is exactly
  // one pattern, in which case it comes from the replacements.
  dim_vector out_dims = strings.dims ();
  dim_vector pass_dims (1, 1);

  if (! single_pat)
    {
      pass_dims = patterns.dims ();
      if (! single_rep && pass_dims != replacements.dims ())
        error ("regexprep: Inconsistent cell array dimensions");
    }
  else if (! single_rep)
    pass_dims = replacements.dims ();

  if (error_state)
    return retval;

  Cell result (out_dims);
  octave_value_list call_args = args;

  const octave_idx_type n_strings = out_dims.numel ();
  const octave_idx_type n_passes = pass_dims.numel ();

  for (octave_idx_type si = 0; si < n_strings; si++)
    {
      call_args(0) = strings(si);

      // A lone pattern or replacement is shared by every pass.
      if (single_pat)
        call_args(1) = patterns(0);
      if (single_rep)
        call_args(2) = replacements(0);

      for (octave_idx_type pi = 0; pi < n_passes; pi++)
        {
          if (! single_pat)
            call_args(1) = patterns(pi);
          if (! single_rep)
            call_args(2) = replacements(pi);

          // The output of one pass is the input of the next one.
          call_args(0) = octregexprep (call_args, "regexprep");

          if (error_state)
            break;
        }

      if (error_state)
        break;

      result(si) = call_args(0);
    }

  if (! error_state)
    retval = octave_value (result);

  return retval;
}

/*
%!test  # Replace with empty
%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
%! t = regexprep(xml,'<[!?][^>]*>','');
%! assert(t,' <tag v="hello">some stuff</tag>')

%!test  # Replace with non-empty
%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
%! t = regexprep(xml,'<[!?][^>]*>','?');
%! assert(t,'? <tag v="hello">some stuff?</tag>')

%!test  # Check that 'tokenize' is ignored
%! xml = '<!-- This is some XML --> <tag v="hello">some stuff<!-- sample tag--></tag>';
%! t = regexprep(xml,'<[!?][^>]*>','','tokenize');
%! assert(t,' <tag v="hello">some stuff</tag>')

%!testif HAVE_PCRE # Capture replacement
%! data = "Bob Smith\nDavid Hollerith\nSam Jenkins";
%! result = "Smith, Bob\nHollerith, David\nJenkins, Sam";
%! t = regexprep(data,'(?m)^(\w+)\s+(\w+)$','$2, $1');
%! assert(t,result)

# Return the original if no match
%!assert(regexprep('hello','world','earth'),'hello')

## Test a general replacement
%!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_"), "a_b_c_d_e_f_g");

## Make sure it works at the beginning and end
%!assert(regexprep("a[b]c{d}e-f=g", "a", "_"), "_[b]c{d}e-f=g");
%!assert(regexprep("a[b]c{d}e-f=g", "g", "_"), "a[b]c{d}e-f=_");

## Options
%!assert(regexprep("a[b]c{d}e-f=g", "[^A-Za-z0-9_]", "_", "once"), "a_b]c{d}e-f=g");
%!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "ignorecase"), "a_b_c_d_e_f_g");

## Option combinations
%!assert(regexprep("a[b]c{d}e-f=g", "[^A-Z0-9_]", "_", "once", "ignorecase"), "a_b]c{d}e-f=g");

## End conditions on replacement
%!assert(regexprep("abc","(b)",".$1"),"a.bc");
%!assert(regexprep("abc","(b)","$1"),"abc");
%!assert(regexprep("abc","(b)","$1."),"ab.c");
%!assert(regexprep("abc","(b)","$1.."),"ab..c");

## Test cell array arguments
%!assert(regexprep("abc",{"b","a"},"?"),{"??c"})
%!assert(regexprep({"abc","cba"},"b","?"),{"a?c","c?a"})
%!assert(regexprep({"abc","cba"},{"b","a"},{"?","!"}),{"!?c","c?!"})

# Nasty lookbehind expression
%!assert(regexprep('x^(-1)+y(-1)+z(-1)=0','(?<=[a-z]+)\(\-[1-9]*\)','_minus1'),'x^(-1)+y_minus1+z_minus1=0')

*/

/*
;;; Local Variables: ***
;;; mode: C++ ***
;;; End: ***
*/

Generated by  Doxygen 1.6.0   Back to index