RegExpImpl.java example

Explorer
geogebra-master
/* -*- Mode: java; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

package org.mozilla.javascript.regexp;

import org.mozilla.javascript.*;

/**
 *
 */
public class RegExpImpl implements RegExpProxy {

    public boolean isRegExp(Scriptable obj) {
        return obj instanceof NativeRegExp;
    }

    public Object compileRegExp(Context cx, String source, String flags)
    {
        return NativeRegExp.compileRE(cx, source, flags, false);
    }

    public Scriptable wrapRegExp(Context cx, Scriptable scope,
                                 Object compiled)
    {
        return new NativeRegExp(scope, (RECompiled) compiled);
    }

    public Object action(Context cx, Scriptable scope,
                         Scriptable thisObj, Object[] args,
                         int actionType)
    {
        GlobData data = new GlobData();
        data.mode = actionType;
        data.str = ScriptRuntime.toString(thisObj);

        switch (actionType) {
          case RA_MATCH:
            {
                NativeRegExp re = createRegExp(cx, scope, args, 1, false);
                Object rval = matchOrReplace(cx, scope, thisObj, args,
                                             this, data, re);
                return data.arrayobj == null ? rval : data.arrayobj;
            }

          case RA_SEARCH:
            {
                NativeRegExp re = createRegExp(cx, scope, args, 1, false);
                return matchOrReplace(cx, scope, thisObj, args,
                                      this, data, re);
            }

          case RA_REPLACE:
            {
                boolean useRE = (args.length > 0 && args[0] instanceof NativeRegExp)
                                || args.length > 2;
                NativeRegExp re = null;
                String search = null;
                if (useRE) {
                    re = createRegExp(cx, scope, args, 2, true);
                } else {
                    Object arg0 = args.length < 1 ? Undefined.instance : args[0];
                    search = ScriptRuntime.toString(arg0);
                }

                Object arg1 = args.length < 2 ? Undefined.instance : args[1];
                String repstr = null;
                Function lambda = null;
                if (arg1 instanceof Function) {
                    lambda = (Function) arg1;
                } else {
                    repstr = ScriptRuntime.toString(arg1);
                }

                data.lambda = lambda;
                data.repstr = repstr;
                data.dollar = repstr == null ? -1 : repstr.indexOf('$');
                data.charBuf = null;
                data.leftIndex = 0;

                Object val;
                if (useRE) {
                    val = matchOrReplace(cx, scope, thisObj, args,
                                         this, data, re);
                } else {
                    String str = data.str;
                    int index = str.indexOf(search);
                    if (index >= 0) {
                        int slen = search.length();
                        this.lastParen = null;
                        this.leftContext = new SubString(str, 0, index);
                        this.lastMatch = new SubString(str, index, slen);
                        this.rightContext = new SubString(str, index + slen, str.length() - index - slen);
                        val = Boolean.TRUE;
                    } else {
                        val = Boolean.FALSE;
                    }
                }

                if (data.charBuf == null) {
                    if (data.global || val == null
                        || !val.equals(Boolean.TRUE))
                    {
                        /* Didn't match even once. */
                        return data.str;
                    }
                    SubString lc = this.leftContext;
                    replace_glob(data, cx, scope, this, lc.index, lc.length);
                }
                SubString rc = this.rightContext;
                data.charBuf.append(rc.str, rc.index, rc.index + rc.length);
                return data.charBuf.toString();
            }

          default:
            throw Kit.codeBug();
        }
    }

    private static NativeRegExp createRegExp(Context cx, Scriptable scope,
                                             Object[] args, int optarg,
                                             boolean forceFlat)
    {
        NativeRegExp re;
        Scriptable topScope = ScriptableObject.getTopLevelScope(scope);
        if (args.length == 0 || args[0] == Undefined.instance) {
            RECompiled compiled = NativeRegExp.compileRE(cx, "", "", false);
            re = new NativeRegExp(topScope, compiled);
        } else if (args[0] instanceof NativeRegExp) {
            re = (NativeRegExp) args[0];
        } else {
            String src = ScriptRuntime.toString(args[0]);
            String opt;
            if (optarg < args.length) {
                args[0] = src;
                opt = ScriptRuntime.toString(args[optarg]);
            } else {
                opt = null;
            }
            RECompiled compiled = NativeRegExp.compileRE(cx, src, opt, forceFlat);
            re = new NativeRegExp(topScope, compiled);
        }
        return re;
    }

    /**
     * Analog of C match_or_replace.
     */
    private static Object matchOrReplace(Context cx, Scriptable scope,
                                         Scriptable thisObj, Object[] args,
                                         RegExpImpl reImpl,
                                         GlobData data, NativeRegExp re)
    {
        String str = data.str;
        data.global = (re.getFlags() & NativeRegExp.JSREG_GLOB) != 0;
        int[] indexp = { 0 };
        Object result = null;
        if (data.mode == RA_SEARCH) {
            result = re.executeRegExp(cx, scope, reImpl,
                                      str, indexp, NativeRegExp.TEST);
            if (result != null && result.equals(Boolean.TRUE))
                result = Integer.valueOf(reImpl.leftContext.length);
            else
                result = Integer.valueOf(-1);
        } else if (data.global) {
            re.lastIndex = 0d;
            for (int count = 0; indexp[0] <= str.length(); count++) {
                result = re.executeRegExp(cx, scope, reImpl,
                                          str, indexp, NativeRegExp.TEST);
                if (result == null || !result.equals(Boolean.TRUE))
                    break;
                if (data.mode == RA_MATCH) {
                    match_glob(data, cx, scope, count, reImpl);
                } else {
                    if (data.mode != RA_REPLACE) Kit.codeBug();
                    SubString lastMatch = reImpl.lastMatch;
                    int leftIndex = data.leftIndex;
                    int leftlen = lastMatch.index - leftIndex;
                    data.leftIndex = lastMatch.index + lastMatch.length;
                    replace_glob(data, cx, scope, reImpl, leftIndex, leftlen);
                }
                if (reImpl.lastMatch.length == 0) {
                    if (indexp[0] == str.length())
                        break;
                    indexp[0]++;
                }
            }
        } else {
            result = re.executeRegExp(cx, scope, reImpl, str, indexp,
                                      ((data.mode == RA_REPLACE)
                                       ? NativeRegExp.TEST
                                       : NativeRegExp.MATCH));
        }

        return result;
    }



    public int find_split(Context cx, Scriptable scope, String target,
                          String separator, Scriptable reObj,
                          int[] ip, int[] matchlen,
                          boolean[] matched, String[][] parensp)
    {
        int i = ip[0];
        int length = target.length();
        int result;

        int version = cx.getLanguageVersion();
        NativeRegExp re = (NativeRegExp) reObj;
        again:
        while (true) {  // imitating C label
            /* JS1.2 deviated from Perl by never matching at end of string. */
            int ipsave = ip[0]; // reuse ip to save object creation
            ip[0] = i;
            Object ret = re.executeRegExp(cx, scope, this, target, ip,
                                          NativeRegExp.TEST);
            if (ret != Boolean.TRUE) {
                // Mismatch: ensure our caller advances i past end of string.
                ip[0] = ipsave;
                matchlen[0] = 1;
                matched[0] = false;
                return length;
            }
            i = ip[0];
            ip[0] = ipsave;
            matched[0] = true;

            SubString sep = this.lastMatch;
            matchlen[0] = sep.length;
            if (matchlen[0] == 0) {
                /*
                 * Empty string match: never split on an empty
                 * match at the start of a find_split cycle.  Same
                 * rule as for an empty global match in
                 * match_or_replace.
                 */
                if (i == ip[0]) {
                    /*
                     * "Bump-along" to avoid sticking at an empty
                     * match, but don't bump past end of string --
                     * our caller must do that by adding
                     * sep->length to our return value.
                     */
                    if (i == length) {
                        if (version == Context.VERSION_1_2) {
                            matchlen[0] = 1;
                            result = i;
                        }
                        else
                            result = -1;
                        break;
                    }
                    i++;
                    continue again; // imitating C goto
                }
            }
            // PR_ASSERT((size_t)i >= sep->length);
            result = i - matchlen[0];
            break;
        }
        int size = (parens == null) ? 0 : parens.length;
        parensp[0] = new String[size];
        for (int num = 0; num < size; num++) {
            SubString parsub = getParenSubString(num);
            parensp[0][num] = parsub.toString();
        }
        return result;
    }

    /**
     * Analog of REGEXP_PAREN_SUBSTRING in C jsregexp.h.
     * Assumes zero-based; i.e., for $3, i==2
     */
    SubString getParenSubString(int i)
    {
        if (parens != null && i < parens.length) {
            SubString parsub = parens[i];
            if (parsub != null) {
                return parsub;
            }
        }
        return SubString.emptySubString;
    }

    /*
     * Analog of match_glob() in jsstr.c
     */
    private static void match_glob(GlobData mdata, Context cx,
                                   Scriptable scope, int count,
                                   RegExpImpl reImpl)
    {
        if (mdata.arrayobj == null) {
            mdata.arrayobj = cx.newArray(scope, 0);
        }
        SubString matchsub = reImpl.lastMatch;
        String matchstr = matchsub.toString();
        mdata.arrayobj.put(count, mdata.arrayobj, matchstr);
    }

    /*
     * Analog of replace_glob() in jsstr.c
     */
    private static void replace_glob(GlobData rdata, Context cx,
                                     Scriptable scope, RegExpImpl reImpl,
                                     int leftIndex, int leftlen)
    {
        int replen;
        String lambdaStr;
        if (rdata.lambda != null) {
            // invoke lambda function with args lastMatch, $1, $2, ... $n,
            // leftContext.length, whole string.
            SubString[] parens = reImpl.parens;
            int parenCount = (parens == null) ? 0 : parens.length;
            Object[] args = new Object[parenCount + 3];
            args[0] = reImpl.lastMatch.toString();
            for (int i=0; i < parenCount; i++) {
                SubString sub = parens[i];
                if (sub != null) {
                    args[i+1] = sub.toString();
                } else {
                    args[i+1] = Undefined.instance;
                }
            }
            args[parenCount+1] = Integer.valueOf(reImpl.leftContext.length);
            args[parenCount+2] = rdata.str;
            // This is a hack to prevent expose of reImpl data to
            // JS function which can run new regexps modifing
            // regexp that are used later by the engine.
            // TODO: redesign is necessary
            if (reImpl != ScriptRuntime.getRegExpProxy(cx)) Kit.codeBug();
            RegExpImpl re2 = new RegExpImpl();
            re2.multiline = reImpl.multiline;
            re2.input = reImpl.input;
            ScriptRuntime.setRegExpProxy(cx, re2);
            try {
                Scriptable parent = ScriptableObject.getTopLevelScope(scope);
                Object result = rdata.lambda.call(cx, parent, parent, args);
                lambdaStr = ScriptRuntime.toString(result);
            } finally {
                ScriptRuntime.setRegExpProxy(cx, reImpl);
            }
            replen = lambdaStr.length();
        } else {
            lambdaStr = null;
            replen = rdata.repstr.length();
            if (rdata.dollar >= 0) {
                int[] skip = new int[1];
                int dp = rdata.dollar;
                do {
                    SubString sub = interpretDollar(cx, reImpl, rdata.repstr,
                                                    dp, skip);
                    if (sub != null) {
                        replen += sub.length - skip[0];
                        dp += skip[0];
                    } else {
                        ++dp;
                    }
                    dp = rdata.repstr.indexOf('$', dp);
                } while (dp >= 0);
            }
        }

        int growth = leftlen + replen + reImpl.rightContext.length;
        StringBuilder charBuf = rdata.charBuf;
        if (charBuf == null) {
            charBuf = new StringBuilder(growth);
            rdata.charBuf = charBuf;
        } else {
            charBuf.ensureCapacity(rdata.charBuf.length() + growth);
        }

        charBuf.append(reImpl.leftContext.str, leftIndex, leftIndex + leftlen);
        if (rdata.lambda != null) {
            charBuf.append(lambdaStr);
        } else {
            do_replace(rdata, cx, reImpl);
        }
    }

    private static SubString interpretDollar(Context cx, RegExpImpl res,
                                             String da, int dp, int[] skip)
    {
        char dc;
        int num, tmp;

        if (da.charAt(dp) != '$') Kit.codeBug();

        /* Allow a real backslash (literal "\\") to escape "$1" etc. */
        int version = cx.getLanguageVersion();
        if (version != Context.VERSION_DEFAULT
            && version <= Context.VERSION_1_4)
        {
            if (dp > 0 && da.charAt(dp - 1) == '\\')
                return null;
        }
        int daL = da.length();
        if (dp + 1 >= daL)
            return null;
        /* Interpret all Perl match-induced dollar variables. */
        dc = da.charAt(dp + 1);
        if (NativeRegExp.isDigit(dc)) {
            int cp;
            if (version != Context.VERSION_DEFAULT
                && version <= Context.VERSION_1_4)
            {
                if (dc == '0')
                    return null;
                /* Check for overflow to avoid gobbling arbitrary decimal digits. */
                num = 0;
                cp = dp;
                while (++cp < daL && NativeRegExp.isDigit(dc = da.charAt(cp)))
                {
                    tmp = 10 * num + (dc - '0');
                    if (tmp < num)
                        break;
                    num = tmp;
                }
            }
            else {  /* ECMA 3, 1-9 or 01-99 */
                int parenCount = (res.parens == null) ? 0 : res.parens.length;
                num = dc - '0';
                if (num > parenCount)
                    return null;
                cp = dp + 2;
                if ((dp + 2) < daL) {
                    dc = da.charAt(dp + 2);
                    if (NativeRegExp.isDigit(dc)) {
                        tmp = 10 * num + (dc - '0');
                        if (tmp <= parenCount) {
                            cp++;
                            num = tmp;
                        }
                    }
                }
                if (num == 0) return null;  /* $0 or $00 is not valid */
            }
            /* Adjust num from 1 $n-origin to 0 array-index-origin. */
            num--;
            skip[0] = cp - dp;
            return res.getParenSubString(num);
        }

        skip[0] = 2;
        switch (dc) {
          case '$':
            return new SubString("$");
          case '&':
            return res.lastMatch;
          case '+':
            return res.lastParen;
          case '`':
            if (version == Context.VERSION_1_2) {
                /*
                 * JS1.2 imitated the Perl4 bug where left context at each step
                 * in an iterative use of a global regexp started from last match,
                 * not from the start of the target string.  But Perl4 does start
                 * $` at the beginning of the target string when it is used in a
                 * substitution, so we emulate that special case here.
                 */
                res.leftContext.index = 0;
                res.leftContext.length = res.lastMatch.index;
            }
            return res.leftContext;
          case '\'':
            return res.rightContext;
        }
        return null;
    }

    /**
     * Analog of do_replace in jsstr.c
     */
    private static void do_replace(GlobData rdata, Context cx,
                                   RegExpImpl regExpImpl)
    {
        StringBuilder charBuf = rdata.charBuf;
        int cp = 0;
        String da = rdata.repstr;
        int dp = rdata.dollar;
        if (dp != -1) {
            int[] skip = new int[1];
            do {
                int len = dp - cp;
                charBuf.append(da.substring(cp, dp));
                cp = dp;
                SubString sub = interpretDollar(cx, regExpImpl, da,
                                                dp, skip);
                if (sub != null) {
                    len = sub.length;
                    if (len > 0) {
                        charBuf.append(sub.str, sub.index, sub.index + len);
                    }
                    cp += skip[0];
                    dp += skip[0];
                } else {
                    ++dp;
                }
                dp = da.indexOf('$', dp);
            } while (dp >= 0);
        }
        int daL = da.length();
        if (daL > cp) {
            charBuf.append(da.substring(cp, daL));
        }
    }

    /*
     * See ECMA 15.5.4.8.  Modified to match JS 1.2 - optionally takes
     * a limit argument and accepts a regular expression as the split
     * argument.
     */
    public Object js_split(Context cx, Scriptable scope,
                                   String target, Object[] args)
    {
        // create an empty Array to return;
        Scriptable result = cx.newArray(scope, 0);

        // Use the second argument as the split limit, if given.
        boolean limited = (args.length > 1) && (args[1] != Undefined.instance);
        long limit = 0;  // Initialize to avoid warning.
        if (limited) {
            /* Clamp limit between 0 and 1 + string length. */
            limit = ScriptRuntime.toUint32(args[1]);
            if (limit > target.length())
                limit = 1 + target.length();
        }

        // return an array consisting of the target if no separator given
        if (args.length < 1 || args[0] == Undefined.instance) {
            result.put(0, result, target);
            return result;
        }

        String separator = null;
        int[] matchlen = new int[1];
        Scriptable re = null;
        RegExpProxy reProxy = null;
        if (args[0] instanceof Scriptable) {
            reProxy = ScriptRuntime.getRegExpProxy(cx);
            if (reProxy != null) {
                Scriptable test = (Scriptable)args[0];
                if (reProxy.isRegExp(test)) {
                    re = test;
                }
            }
        }
        if (re == null) {
            separator = ScriptRuntime.toString(args[0]);
            matchlen[0] = separator.length();
        }

        // split target with separator or re
        int[] ip = { 0 };
        int match;
        int len = 0;
        boolean[] matched = { false };
        String[][] parens = { null };
        int version = cx.getLanguageVersion();
        while ((match = find_split(cx, scope, target, separator, version,
                                   reProxy, re, ip, matchlen, matched, parens))
               >= 0)
        {
            if ((limited && len >= limit) || (match > target.length()))
                break;

            String substr;
            if (target.length() == 0)
                substr = target;
            else
                substr = target.substring(ip[0], match);

            result.put(len, result, substr);
            len++;
        /*
         * Imitate perl's feature of including parenthesized substrings
         * that matched part of the delimiter in the new array, after the
         * split substring that was delimited.
         */
            if (re != null && matched[0] == true) {
                int size = parens[0].length;
                for (int num = 0; num < size; num++) {
                    if (limited && len >= limit)
                        break;
                    result.put(len, result, parens[0][num]);
                    len++;
                }
                matched[0] = false;
            }
            ip[0] = match + matchlen[0];

            if (version < Context.VERSION_1_3
                && version != Context.VERSION_DEFAULT)
            {
        /*
         * Deviate from ECMA to imitate Perl, which omits a final
         * split unless a limit argument is given and big enough.
         */
                if (!limited && ip[0] == target.length())
                    break;
            }
        }
        return result;
    }

    /*
     * Used by js_split to find the next split point in target,
     * starting at offset ip and looking either for the given
     * separator substring, or for the next re match.  ip and
     * matchlen must be reference variables (assumed to be arrays of
     * length 1) so they can be updated in the leading whitespace or
     * re case.
     *
     * Return -1 on end of string, >= 0 for a valid index of the next
     * separator occurrence if found, or the string length if no
     * separator is found.
     */
    private static int find_split(Context cx, Scriptable scope, String target,
                                  String separator, int version,
                                  RegExpProxy reProxy, Scriptable re,
                                  int[] ip, int[] matchlen, boolean[] matched,
                                  String[][] parensp)
    {
        int i = ip[0];
        int length = target.length();

        /*
         * Perl4 special case for str.split(' '), only if the user has selected
         * JavaScript1.2 explicitly.  Split on whitespace, and skip leading w/s.
         * Strange but true, apparently modeled after awk.
         */
        if (version == Context.VERSION_1_2 &&
            re == null && separator.length() == 1 && separator.charAt(0) == ' ')
        {
            /* Skip leading whitespace if at front of str. */
            if (i == 0) {
                while (i < length && Character.isWhitespace(target.charAt(i)))
                    i++;
                ip[0] = i;
            }

            /* Don't delimit whitespace at end of string. */
            if (i == length)
                return -1;

            /* Skip over the non-whitespace chars. */
            while (i < length
                   && !Character.isWhitespace(target.charAt(i)))
                i++;

            /* Now skip the next run of whitespace. */
            int j = i;
            while (j < length && Character.isWhitespace(target.charAt(j)))
                j++;

            /* Update matchlen to count delimiter chars. */
            matchlen[0] = j - i;
            return i;
        }

        /*
         * Stop if past end of string.  If at end of string, we will
         * return target length, so that
         *
         *  "ab,".split(',') => new Array("ab", "")
         *
         * and the resulting array converts back to the string "ab,"
         * for symmetry.  NB: This differs from perl, which drops the
         * trailing empty substring if the LIMIT argument is omitted.
         */
        if (i > length)
            return -1;

        /*
         * Match a regular expression against the separator at or
         * above index i.  Return -1 at end of string instead of
         * trying for a match, so we don't get stuck in a loop.
         */
        if (re != null) {
            return reProxy.find_split(cx, scope, target, separator, re,
                                      ip, matchlen, matched, parensp);
        }

        /*
         * Deviate from ECMA by never splitting an empty string by any separator
         * string into a non-empty array (an array of length 1 that contains the
         * empty string).
         */
        if (version != Context.VERSION_DEFAULT && version < Context.VERSION_1_3
            && length == 0)
            return -1;

        /*
         * Special case: if sep is the empty string, split str into
         * one character substrings.  Let our caller worry about
         * whether to split once at end of string into an empty
         * substring.
         *
         * For 1.2 compatibility, at the end of the string, we return the length as
         * the result, and set the separator length to 1 -- this allows the caller
         * to include an additional null string at the end of the substring list.
         */
        if (separator.length() == 0) {
            if (version == Context.VERSION_1_2) {
                if (i == length) {
                    matchlen[0] = 1;
                    return i;
                }
                return i + 1;
            }
            return (i == length) ? -1 : i + 1;
        }

        /* Punt to j.l.s.indexOf; return target length if separator is
         * not found.
         */
        if (ip[0] >= length)
            return length;

        i = target.indexOf(separator, ip[0]);

        return (i != -1) ? i : length;
    }

    protected String          input;         /* input string to match (perl $_, GC root) */
    protected boolean         multiline;     /* whether input contains newlines (perl $*) */
    protected SubString[]     parens;        /* Vector of SubString; last set of parens
                                      matched (perl $1, $2) */
    protected SubString       lastMatch;     /* last string matched (perl $&) */
    protected SubString       lastParen;     /* last paren matched (perl $+) */
    protected SubString       leftContext;   /* input to left of last match (perl $`) */
    protected SubString       rightContext;  /* input to right of last match (perl $') */
}


final class GlobData
{
    int      mode;      /* input: return index, match object, or void */
    boolean  global;    /* output: whether regexp was global */
    String   str;       /* output: 'this' parameter object as string */

    // match-specific data

    Scriptable arrayobj;

    // replace-specific data

    Function      lambda;        /* replacement function object or null */
    String        repstr;        /* replacement string */
    int           dollar = -1;   /* -1 or index of first $ in repstr */
    StringBuilder charBuf;       /* result characters, null initially */
    int           leftIndex;     /* leftContext index, always 0 for JS1.2 */
}