package water.rapids.ast.prims.string; import org.apache.commons.lang.StringUtils; import water.MRTask; import water.fvec.*; import water.parser.BufferedString; import water.rapids.Env; import water.rapids.Val; import water.rapids.vals.ValFrame; import water.rapids.ast.AstPrimitive; import water.rapids.ast.AstRoot; import water.util.VecUtils; import java.util.ArrayList; import java.util.HashMap; /** * Accepts a frame with a single string column. * Returns a new string column containing the rstripped versions of the strings in the target column. * Stripping removes all characters in the strings for the target columns that match the user provided set */ public class AstRStrip extends AstPrimitive { @Override public String[] args() { return new String[]{"ary", "set"}; } @Override public int nargs() { return 1 + 2; } @Override public String str() { return "rstrip"; } @Override public ValFrame apply(Env env, Env.StackHelp stk, AstRoot asts[]) { Frame fr = stk.track(asts[1].exec(env)).getFrame(); String set = asts[2].exec(env).getStr(); // Type check for (Vec v : fr.vecs()) if (!(v.isCategorical() || v.isString())) throw new IllegalArgumentException("trim() requires a string or categorical column. " + "Received " + fr.anyVec().get_type_str() + ". Please convert column to a string or categorical first."); // Transform each vec Vec nvs[] = new Vec[fr.numCols()]; int i = 0; for (Vec v : fr.vecs()) { if (v.isCategorical()) nvs[i] = rstripCategoricalCol(v, set); else nvs[i] = rstripStringCol(v, set); i++; } return new ValFrame(new Frame(nvs)); } private Vec rstripCategoricalCol(Vec vec, String set) { String[] doms = vec.domain().clone(); HashMap<String, ArrayList<Integer>> strippedToOldDomainIndices = new HashMap<>(); String stripped; for (int i = 0; i < doms.length; i++) { stripped = StringUtils.stripEnd(doms[i], set); doms[i] = stripped; if (!strippedToOldDomainIndices.containsKey(stripped)) { ArrayList<Integer> val = new ArrayList<>(); val.add(i); strippedToOldDomainIndices.put(stripped, val); } else { strippedToOldDomainIndices.get(stripped).add(i); } } //Check for duplicated domains if (strippedToOldDomainIndices.size() < doms.length) return VecUtils.DomainDedupe.domainDeduper(vec, strippedToOldDomainIndices); return vec.makeCopy(doms); } private Vec rstripStringCol(Vec vec, String set) { final String charSet = set; return new MRTask() { @Override public void map(Chunk chk, NewChunk newChk) { if (chk instanceof C0DChunk) // all NAs for (int i = 0; i < chk.len(); i++) newChk.addNA(); else if (((CStrChunk) chk)._isAllASCII && StringUtils.isAsciiPrintable(charSet)) { // fast-path operations ((CStrChunk) chk).asciiRStrip(newChk, charSet); } else { BufferedString tmpStr = new BufferedString(); for (int i = 0; i < chk.len(); i++) { if (chk.isNA(i)) newChk.addNA(); else newChk.addStr(StringUtils.stripEnd(chk.atStr(tmpStr, i).toString(), charSet)); } } } }.doAll(new byte[]{Vec.T_STR}, vec).outputFrame().anyVec(); } }