package water.rapids.ast.prims.string;

import water.MRTask;
import water.fvec.*;
import water.parser.BufferedString;
import water.rapids.Env;
import water.rapids.Val;
import water.rapids.vals.ValFrame;
import water.rapids.ast.AstPrimitive;
import water.rapids.ast.AstRoot;

import java.util.Locale;
import java.util.regex.Pattern;

/**
 * Rapids primitive {@code replaceall}.
 * <p>
 * Accepts a frame whose columns are all string or categorical, a regex pattern string, a
 * replacement string, and a numeric flag (1 = true) indicating whether to ignore case.
 * Returns a new frame with one output column per input column, where every string value
 * (or every categorical domain level) has been passed through
 * {@link String#replaceAll(String, String)} semantics: each substring that matches the
 * regular expression is replaced with the given replacement.
 * <p>
 * When {@code ignore_case} is set, each value is lower-cased (English locale) before the
 * pattern is applied. The pattern itself is used as given, and the lower-cased text is what
 * ends up in the result.
 */
public class AstReplaceAll extends AstPrimitive {
  @Override
  public String[] args() {
    return new String[]{"ary", "pattern", "replacement", "ignore_case"};
  }

  @Override
  public int nargs() {
    return 1 + 4;
  } // (replaceall x pattern replacement ignore_case)

  @Override
  public String str() {
    return "replaceall";
  }

  /**
   * Evaluates the four arguments and applies the replacement to every column of the frame.
   *
   * @throws IllegalArgumentException if any column is neither string nor categorical
   */
  @Override
  public Val apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    // Argument evaluation order is kept as: pattern, replacement, frame, ignore_case.
    final String pattern = asts[2].exec(env).getStr();
    final String replacement = asts[3].exec(env).getStr();
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    final boolean ignoreCase = asts[4].exec(env).getNum() == 1;

    // Type check: report the type of the column that actually failed the check.
    for (Vec v : fr.vecs()) {
      if (!(v.isCategorical() || v.isString())) {
        throw new IllegalArgumentException("replaceall() requires a string or categorical column. "
            + "Received " + v.get_type_str()
            + ". Please convert column to a string or categorical first.");
      }
    }

    // Transform each vec
    Vec[] nvs = new Vec[fr.numCols()];
    int i = 0;
    for (Vec v : fr.vecs()) {
      if (v.isCategorical()) {
        nvs[i] = replaceAllCategoricalCol(v, pattern, replacement, ignoreCase);
      } else {
        nvs[i] = replaceAllStringCol(v, pattern, replacement, ignoreCase);
      }
      i++;
    }

    return new ValFrame(new Frame(nvs));
  }

  /**
   * Applies the replacement to a single value.
   * <p>
   * NOTE(review): with {@code ignoreCase} the whole input is lower-cased before matching, so
   * unmatched text is lower-cased too and an upper-case pattern will not match. This mirrors
   * the existing behavior; confirm before switching to a case-insensitive regex flag.
   */
  private static String replaceAllIn(String value, Pattern regex, String replacement, boolean ignoreCase) {
    String target = ignoreCase ? value.toLowerCase(Locale.ENGLISH) : value;
    return regex.matcher(target).replaceAll(replacement);
  }

  /**
   * Rewrites the domain levels of a categorical column and returns a copy of the vec that
   * uses the rewritten domain. The original domain array is cloned, not modified.
   */
  private Vec replaceAllCategoricalCol(Vec vec, String pattern, String replacement, boolean ignoreCase) {
    final Pattern regex = Pattern.compile(pattern); // compile once, not once per level
    String[] doms = vec.domain().clone();
    for (int i = 0; i < doms.length; ++i) {
      doms[i] = replaceAllIn(doms[i], regex, replacement, ignoreCase);
    }
    return vec.makeCopy(doms);
  }

  /**
   * Produces a new string column in which every non-NA value of {@code vec} has had the
   * replacement applied; NA rows stay NA.
   */
  private Vec replaceAllStringCol(Vec vec, final String pattern, final String replacement,
                                  final boolean ignoreCase) {
    return new MRTask() {
      @Override
      public void map(Chunk chk, NewChunk newChk) {
        if (chk instanceof C0DChunk) { // all NAs
          for (int i = 0; i < chk._len; i++) {
            newChk.addNA();
          }
          return;
        }
        // Compiled locally per chunk so the task itself only captures the plain strings
        // and the flag. UTF requires Java string methods for accuracy, so no ASCII-only
        // fast path is taken here.
        final Pattern regex = Pattern.compile(pattern);
        BufferedString tmpStr = new BufferedString();
        for (int i = 0; i < chk._len; i++) {
          if (chk.isNA(i)) {
            newChk.addNA();
          } else {
            String value = chk.atStr(tmpStr, i).toString();
            newChk.addStr(replaceAllIn(value, regex, replacement, ignoreCase));
          }
        }
      }
    }.doAll(new byte[]{Vec.T_STR}, vec).outputFrame().anyVec();
  }
}