package water.rapids.ast.prims.string;
import java.util.regex.Pattern;
import water.MRTask;
import water.fvec.*;
import water.parser.BufferedString;
import water.rapids.Env;
import water.rapids.ast.AstPrimitive;
import water.rapids.ast.AstRoot;
import water.rapids.vals.ValFrame;
/**
 * Rapids primitive {@code (tokenize x regex)}.
 *
 * <p>Splits every non-NA string cell of the input frame around matches of {@code regex} and
 * writes the resulting tokens, one per output row, into a single output column created with
 * type {@code Vec.T_STR}. After all columns of an input row have been processed, one NA is
 * appended, so the tokens of consecutive input rows are separated by NA entries.
 */
public class AstTokenize extends AstPrimitive {
  @Override
  public String[] args() {
    return new String[]{"ary", "regex"};
  }

  @Override
  public int nargs() {
    return 1 + 2;
  } // (tokenize x regex)

  @Override
  public String str() {
    return "tokenize";
  }

  /**
   * Evaluates the frame and regex arguments, checks that every column is a string column and
   * runs the tokenizer task over the frame.
   *
   * @param env  execution environment used to evaluate the argument ASTs
   * @param stk  stack helper that tracks the evaluated frame argument
   * @param asts {@code asts[1]} is the input frame, {@code asts[2]} is the split regex
   * @return the tokenized frame wrapped in a {@link ValFrame}
   * @throws IllegalArgumentException if any input column is not a string column
   */
  @Override
  public ValFrame apply(Env env, Env.StackHelp stk, AstRoot[] asts) {
    Frame fr = stk.track(asts[1].exec(env)).getFrame();
    String regex = asts[2].exec(env).getStr();

    // Type check: report the type of the column that actually failed, not of an arbitrary one.
    for (Vec v : fr.vecs()) {
      if (!v.isString()) {
        throw new IllegalArgumentException("tokenize() requires all input columns to be of a String type. "
            + "Received " + v.get_type_str() + ". Please convert column to a string column first.");
      }
    }

    Frame tokenized = new Tokenizer(regex).doAll(Vec.T_STR, fr).outputFrame();
    return new ValFrame(tokenized);
  }

  /**
   * Task that emits the tokens of every input row followed by a single NA.
   */
  private static class Tokenizer extends MRTask<Tokenizer> {
    // NOTE(review): kept as a String rather than a compiled Pattern field; presumably the task
    // state is serialized by the framework and Pattern may not be supported there - confirm.
    private final String _regex;

    public Tokenizer(String regex) {
      _regex = regex;
    }

    /**
     * Tokenizes one set of aligned chunks.
     *
     * @param cs input chunks, one per input column
     * @param nc output chunk receiving the tokens and the per-row NA terminators
     */
    @Override
    public void map(Chunk[] cs, NewChunk nc) {
      // Compile once per map call instead of once per cell; Pattern.split(input) yields the
      // same result as String.split(regex).
      final Pattern splitter = Pattern.compile(_regex);
      BufferedString tmpStr = new BufferedString();
      for (int row = 0; row < cs[0]._len; row++) {
        for (Chunk chk : cs) {
          if (chk.isNA(row)) {
            continue; // NA cells contribute no tokens
          }
          String[] tokens = splitter.split(chk.atStr(tmpStr, row).toString());
          for (String token : tokens) {
            nc.addStr(token);
          }
        }
        // Terminate the tokens of this input row with an NA.
        nc.addNA();
      }
    }
  }
}