/*******************************************************************************
* Copyright 2012 University of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This code was developed by the Information Integration Group as part
* of the Karma project at the Information Sciences Institute of the
* University of Southern California. For more information, publications,
* and related projects, please see: http://www.isi.edu/integration
******************************************************************************/
package edu.isi.karma.controller.command.cleaning;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Vector;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.SimpleLayout;
import org.json.JSONArray;
import org.json.JSONObject;
import au.com.bytecode.opencsv.CSVReader;
import edu.isi.karma.cleaning.ConfigParameters;
import edu.isi.karma.cleaning.DataCollection;
import edu.isi.karma.cleaning.ExampleSelection;
import edu.isi.karma.cleaning.Ruler;
import edu.isi.karma.cleaning.TNode;
import edu.isi.karma.cleaning.UtilTools;
import edu.isi.karma.controller.command.CommandException;
import edu.isi.karma.controller.command.WorksheetCommand;
import edu.isi.karma.controller.update.CleaningResultUpdate;
import edu.isi.karma.controller.update.UpdateContainer;
import edu.isi.karma.rep.HNodePath;
import edu.isi.karma.rep.Node;
import edu.isi.karma.rep.Worksheet;
import edu.isi.karma.rep.cleaning.RamblerTransformationExample;
import edu.isi.karma.rep.cleaning.RamblerTransformationInputs;
import edu.isi.karma.rep.cleaning.RamblerTransformationOutput;
import edu.isi.karma.rep.cleaning.RamblerValueCollection;
import edu.isi.karma.rep.cleaning.TransformationExample;
import edu.isi.karma.rep.cleaning.ValueCollection;
import edu.isi.karma.view.VWorkspace;
import edu.isi.karma.webserver.ServletContextParameterMap;
import edu.isi.karma.webserver.ServletContextParameterMap.ContextParameter;
public class GenerateCleaningRulesCommand extends WorksheetCommand {
final String hNodeId;
private Vector<TransformationExample> examples;
private HashSet<String> nodeIds = new HashSet<String>();
RamblerTransformationInputs inputs;
public String compResultString = "";
public GenerateCleaningRulesCommand(String id, String worksheetId,
String hNodeId, String examples, String cellIDs) {
super(id, worksheetId);
this.hNodeId = hNodeId;
this.nodeIds = parseNodeIds(cellIDs);
ConfigParameters cfg = new ConfigParameters();
cfg.initeParameters();
DataCollection.config = cfg.getString();
this.examples = parseExample(examples);
////log info
try
{
FileAppender appender = new FileAppender(new SimpleLayout(),"./log/cleanning.log");
logger.addAppender(appender);
}
catch (Exception e) {
}
}
private HashSet<String> parseNodeIds(String Ids) {
HashSet<String> tSet = new HashSet<String>();
try {
JSONArray jsa = new JSONArray(Ids);
for (int i = 0; i < jsa.length(); i++) {
tSet.add(jsa.getString(i));
}
} catch (Exception e) {
System.out.println("" + e.toString());
}
return tSet;
}
public static Vector<TransformationExample> parseExample(String example) {
Vector<TransformationExample> x = new Vector<TransformationExample>();
try {
JSONArray jsa = new JSONArray(example);
for (int i = 0; i < jsa.length(); i++) {
String[] ary = new String[3];
JSONObject jo = (JSONObject) jsa.get(i);
String nodeid = (String) jo.get("nodeId");
String before = (String) jo.getString("before");
String after = (String) jo.getString("after");
ary[0] = nodeid;
ary[1] = "<_START>" + before + "<_END>";
ary[2] = after;
TransformationExample re = new RamblerTransformationExample(
ary[1], ary[2], ary[0]);
x.add(re);
}
} catch (Exception ex) {
System.out.println("" + ex.toString());
}
return x;
}
private String getBestExample(HashMap<String, String[]> xHashMap,
HashMap<String, Vector<String[]>> expFeData) {
String ID = "";
ExampleSelection es = new ExampleSelection();
es.inite(xHashMap, expFeData);
return es.Choose();
}
/* private static Vector<String> getTopK(Set<String> res, int k, String cmpres) {
String dirpathString = ServletContextParameterMap
.getParameterValue(ContextParameter.USER_DIRECTORY_PATH);
if (dirpathString.compareTo("") == 0) {
dirpathString = "./src/main/webapp/";
}
String trainPath = dirpathString + "grammar/features.arff";
//
String[] x = (String[]) res.toArray(new String[res.size()]);
System.out.println("" + x);
// Vector<Double> scores = UtilTools.getScores(x, trainPath);
Vector<Double> scores = UtilTools.getScores2(x, cmpres);
System.out.println("Scores: " + scores);
Vector<Integer> ins = UtilTools.topKindexs(scores, k);
System.out.println("Indexs: " + ins);
Vector<String> y = new Vector<String>();
for (int i = 0; i < k && i < ins.size(); i++) {
y.add(x[ins.get(i)]);
}
return y;
}*/
@Override
public String getCommandName() {
return GenerateCleaningRulesCommand.class.getSimpleName();
}
@Override
public String getTitle() {
return "Generate Cleaning Rules";
}
@Override
public String getDescription() {
// TODO Auto-generated method stub
return null;
}
@Override
public CommandType getCommandType() {
return CommandType.notInHistory;
}
public void StringColorCode(String org, String res,
HashMap<String, String> dict) {
int segmentCnt = 0;
Vector<int[]> allUpdates = new Vector<int[]>();
String pat = "((?<=\\{_L\\})|(?=\\{_L\\}))";
String pat1 = "((?<=\\{_S\\})|(?=\\{_S\\}))";
String orgdis = "";
String tardis = "";
String tar = "";
String[] st = res.split(pat);
boolean inloop = false;
for (String token : st) {
if (token.compareTo("{_L}") == 0 && !inloop) {
inloop = true;
continue;
}
if (token.compareTo("{_L}") == 0 && inloop) {
inloop = false;
continue;
}
String[] st1 = token.split(pat1);
for (String str : st1) {
if (str.compareTo("{_S}") == 0 || str.compareTo("{_S}") == 0) {
continue;
}
if (str.indexOf("{_C}") != -1) {
String[] pos = str.split("\\{_C\\}");
int[] poses = { Integer.valueOf(pos[0]),
Integer.valueOf(pos[1]),segmentCnt};
boolean findPos = false;
for (int i = 0; i < allUpdates.size(); i++) {
int[] cur = allUpdates.get(i);
if (poses[0] <= cur[0]) {
findPos = true;
allUpdates.add(i, poses);
break; // avoid infinite adding
}
}
if(!findPos)
{
allUpdates.add(poses);
}
String tarseg = org.substring(Integer.valueOf(pos[0]),
Integer.valueOf(pos[1]));
if (inloop) {
tardis += String.format(
"<span class=\"a%d\">%s</span>", segmentCnt,
tarseg);
// orgdis +=
// String.format("<span class=\"a%d\">%s</span>",
// segmentCnt,tarseg);
tar += tarseg;
} else {
tardis += String.format(
"<span class=\"a%d\">%s</span>", segmentCnt,
tarseg);
// orgdis +=
// String.format("<span class=\"a%d\">%s</span>",
// segmentCnt,tarseg);
segmentCnt++;
tar += tarseg;
}
} else {
tardis += String.format("<span class=\"ins\">%s</span>",
str);
tar += str;
}
}
}
int pre = 0;
for(int[] update:allUpdates)
{
if(update[0] >= pre)
{
orgdis += org.substring(pre,update[0]);
orgdis += String.format(
"<span class=\"a%d\">%s</span>", update[2],
org.substring(update[0],update[1]));
pre = update[1];
}
}
if(org.length() > pre)
orgdis += org.substring(pre);
dict.put("Org", org);
dict.put("Tar", tar);
dict.put("Orgdis", orgdis);
dict.put("Tardis", tardis);
}
private static Logger logger = Logger.getLogger(GenerateCleaningRulesCommand.class);
@Override
public UpdateContainer doIt(VWorkspace vWorkspace) throws CommandException {
Worksheet wk = vWorkspace.getRepFactory().getWorksheet(worksheetId);
String Msg = String.format("Gen rule start,Time:%d, Worksheet:%s",System.currentTimeMillis(),worksheetId);
logger.info(Msg);
// Get the HNode
HashMap<String, String> rows = new HashMap<String, String>();
HashMap<String, Integer> amb = new HashMap<String, Integer>();
boolean firstCol = true;
HNodePath selectedPath = null;
List<HNodePath> columnPaths = wk.getHeaders().getAllPaths();
for (HNodePath path : columnPaths) {
if (path.getLeaf().getId().equals(hNodeId)) {
selectedPath = path;
}
}
Collection<Node> nodes = new ArrayList<Node>();
wk.getDataTable().collectNodes(selectedPath, nodes);
for (Node node : nodes) {
String id = node.getId();
if (!this.nodeIds.contains(id))
continue;
String originalVal = node.getValue().asString();
rows.put(id, originalVal);
this.compResultString += originalVal + "\n";
calAmbScore(id, originalVal, amb);
}
RamblerValueCollection vc = new RamblerValueCollection(rows);
HashMap<String, Vector<String[]>> expFeData = new HashMap<String, Vector<String[]>>();
inputs = new RamblerTransformationInputs(examples, vc);
// generate the program
boolean results = false;
int iterNum = 0;
RamblerTransformationOutput rtf = null;
long time1 = System.currentTimeMillis();
while (iterNum < 1 && !results) // try to find an program within iterNum
{
rtf = new RamblerTransformationOutput(inputs);
if (rtf.getTransformations().keySet().size() > 0) {
results = true;
}
iterNum++;
}
long time2 = System.currentTimeMillis();
Iterator<String> iter = rtf.getTransformations().keySet().iterator();
long time6 = 0, time7 = 0;
// id:{org: tar: orgdis: tardis: }
HashMap<String, HashMap<String, String>> resdata = new HashMap<String, HashMap<String, String>>();
HashSet<String> keys = new HashSet<String>();
while (iter.hasNext()) {
long _time5 = System.currentTimeMillis();
String tpid = iter.next();
ValueCollection rvco = rtf.getTransformedValues_debug(tpid);
if (rvco == null)
continue;
long _time6 = System.currentTimeMillis();
// constructing displaying data
HashMap<String, String[]> xyzHashMap = new HashMap<String, String[]>();
for (String key : rvco.getNodeIDs()) {
HashMap<String, String> dict = new HashMap<String, String>();
// add to the example selection
boolean isExp = false;
String org = vc.getValue(key);
String classLabel = rvco.getClass(key);
String pretar = rvco.getValue(key);
String dummyValue = pretar;
if(pretar.indexOf("_FATAL_ERROR_")!= -1)
{
dummyValue = org;
}
this.StringColorCode(org, dummyValue, dict);
for (TransformationExample exp : examples) {
if (exp.getNodeId().compareTo(key) == 0) {
if (!expFeData.containsKey(classLabel)) {
Vector<String[]> vstr = new Vector<String[]>();
String[] texp = {dict.get("Org"), pretar};
vstr.add(texp);
expFeData.put(classLabel, vstr);
} else {
String[] texp = { dict.get("Org"), pretar };
expFeData.get(classLabel).add(texp);
}
isExp = true;
}
}
if (!isExp) {
String[] pair = { dict.get("Org"), dict.get("Tar"), pretar,
classLabel };
xyzHashMap.put(key, pair);
}
resdata.put(key, dict);
}
if(!rtf.nullRule)
keys.add(getBestExample(xyzHashMap, expFeData));
long _time7 = System.currentTimeMillis();
time6 += _time6 - _time5;
time7 = _time7 - _time6;
}
// find the best row
String vars = "";
String expstr = "";
String recmd = "";
for(TransformationExample x:examples)
{
expstr += String.format("%s|%s", x.getBefore(),x.getAfter());
}
expstr += "|";
if(rtf.nullRule)
{
keys.clear();
//keys.add("-2"); // "-2 indicates null rule"
}
if(!resdata.isEmpty() && !rtf.nullRule)
{
recmd = resdata.get(keys.iterator().next()).get("Org");
}
else
{
recmd = "";
}
Msg = String.format("Gen rule end, Time:%d, Worksheet:%s,Examples:%s,Recmd:%s",System.currentTimeMillis(),worksheetId,expstr,recmd);
logger.info(Msg);
return new UpdateContainer(new CleaningResultUpdate(hNodeId, resdata,
vars, keys));
}
public String getVarJSON(HashMap<String, HashSet<String>> values) {
JSONObject jsobj = new JSONObject();
try {
for (String key : values.keySet()) {
JSONArray jsonArray = new JSONArray();
HashSet<String> vs = values.get(key);
for (String v : vs)
jsonArray.put(v);
jsobj.put(key, jsonArray);
}
} catch (Exception e) {
System.out.println("value generation error");
}
return jsobj.toString();
}
public void calAmbScore(String id, String org, HashMap<String, Integer> amb) {
Ruler ruler = new Ruler();
ruler.setNewInput(org);
Vector<TNode> tNodes = ruler.vec;
int tcnt = 1;
for (int i = 0; i < tNodes.size(); i++) {
if (tNodes.get(i).text.compareTo(" ") == 0)
continue;
for (int j = 0; j > i && j < tNodes.size(); j++) {
if (tNodes.get(j).sameNode(tNodes.get(i))) {
tcnt++;
}
}
}
amb.put(id, tcnt);
}
public void updateCandiScore(ValueCollection rvco,
HashMap<String, HashMap<String, Integer>> values) {
Iterator<String> ids = rvco.getNodeIDs().iterator();
while (ids.hasNext()) {
String id = ids.next();
String value = rvco.getValue(id);
HashMap<String, Integer> dict;
if (values.containsKey(id)) {
dict = values.get(id);
} else {
dict = new HashMap<String, Integer>();
values.put(id, dict);
}
if (dict.containsKey(value)) {
dict.put(value, dict.get(value) + 1);
} else {
dict.put(value, 1);
}
}
return;
}
public HashMap<String, Double> getScore(HashMap<String, Integer> dicts,
HashMap<String, HashMap<String, Integer>> values, boolean sw) {
int topKsize = 1;
if (sw)
topKsize = Integer.MAX_VALUE;
HashMap<String, Double> topK = new HashMap<String, Double>();
Iterator<String> iditer = dicts.keySet().iterator();
while (iditer.hasNext()) {
String id = iditer.next();
int amb = dicts.get(id);
HashMap<String, Integer> hm = values.get(id);
int div = 0;
int squrecnt = 0;
Iterator<String> iters = hm.keySet().iterator();
while (iters.hasNext()) {
String value = iters.next();
squrecnt += Math.pow(hm.get(value), 2);
}
div = hm.keySet().size();
// double entro = squrecnt*1.0/div;
// double score = amb*1.0/entro;
double score = div;
if (topK.keySet().size() < topKsize && div > 1) {
topK.put(id, score);
} else {
String[] keys = topK.keySet().toArray(
new String[topK.keySet().size()]);
for (String key : keys) {
if (topK.get(key) < score) {
topK.remove(key);
topK.put(id, score);
}
}
}
}
return topK;
}
@Override
public UpdateContainer undoIt(VWorkspace vWorkspace) {
// TODO Auto-generated method stub
return null;
}
public static void main(String[] args) {
String dirpath = "/Users/bowu/Research/testdata/TestSingleFile";
File nf = new File(dirpath);
File[] allfiles = nf.listFiles();
for (File f : allfiles) {
try {
if (f.getName().indexOf(".csv") == (f.getName().length() - 4)) {
CSVReader cr = new CSVReader(new FileReader(f), '\t');
String[] pair;
int isadded = 0;
HashMap<String, String> tx = new HashMap<String, String>();
int i = 0;
Vector<TransformationExample> vrt = new Vector<TransformationExample>();
while ((pair = cr.readNext()) != null) {
pair[0] = "<_START>" + pair[0] + "<_END>";
tx.put(i + "", pair[0]);
if (isadded < 2) {
RamblerTransformationExample tmp = new RamblerTransformationExample(
pair[0], pair[1], i + "");
vrt.add(tmp);
isadded++;
}
i++;
}
RamblerValueCollection vc = new RamblerValueCollection(tx);
RamblerTransformationInputs inputs = new RamblerTransformationInputs(
vrt, vc);
// generate the program
RamblerTransformationOutput rtf = new RamblerTransformationOutput(
inputs);
HashMap<String, Vector<String>> js2tps = new HashMap<String, Vector<String>>();
Iterator<String> iter = rtf.getTransformations().keySet()
.iterator();
Vector<ValueCollection> vvc = new Vector<ValueCollection>();
while (iter.hasNext()) {
String tpid = iter.next();
ValueCollection rvco = rtf.getTransformedValues(tpid);
vvc.add(rvco);
String reps = rvco.getJson().toString();
if (js2tps.containsKey(reps)) {
js2tps.get(reps).add(tpid); // update the variance
// dic
} else {
Vector<String> tps = new Vector<String>();
tps.add(tpid);
js2tps.put(reps, tps);
}
}
// //////
if (js2tps.keySet().size() == 0) {
System.out.println("No Rules have been found");
return;
}
for (String s : js2tps.keySet()) {
System.out.println("" + s);
}
}
} catch (Exception ex) {
System.out.println("" + ex.toString());
}
}
}
}