package edu.isi.karma.cleaning;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.Vector;

import org.apache.mahout.math.Arrays;

import au.com.bytecode.opencsv.CSVReader;

/**
 * Ad-hoc experiment drivers for the program-synthesis based data cleaning code.
 *
 * <p>Each driver feeds (original, target) string pairs to {@link ProgSynthesis}, applies the
 * returned {@link ProgramRule}s to a data set and reports how many examples were needed.
 * Original values are wrapped in {@code <_START>} / {@code <_END>} markers before they are
 * handed to the synthesizer.
 */
public class Test {

	/** Marker prepended to an original value before it is used as an example. */
	private static final String START_TOKEN = "<_START>";

	/** Marker appended to an original value before it is used as an example. */
	private static final String END_TOKEN = "<_END>";

	/** Value {@link #MaximalNumber} holds before any search result was recorded. */
	private static final int INITIAL_MAXIMAL_NUMBER = -1;

	/** Value {@link #MinimalNumber} holds before any search result was recorded. */
	private static final int INITIAL_MINIMAL_NUMBER = 100;

	/** Largest number of examples seen by {@link #test0_sub} for the current file. */
	public static int MaximalNumber = INITIAL_MAXIMAL_NUMBER;

	/** Smallest number of examples seen by {@link #test0_sub} for the current file. */
	public static int MinimalNumber = INITIAL_MINIMAL_NUMBER;

	/** Example sequence that produced {@link #MaximalNumber}. */
	public static Vector<String[]> larexamples = new Vector<String[]>();

	/** Example sequence that produced {@link #MinimalNumber}. */
	public static Vector<String[]> smalexamples = new Vector<String[]>();

	/**
	 * Learns a program from two hard-coded examples and prints the result of applying the first
	 * learned rule to a third, unseen value.
	 */
	public static void test1() {
		Vector<String[]> examples = new Vector<String[]>();
		String[] xStrings = {
				"<_START>Technische Universität Berlin, DAI-Labor, Berlin, Germany<_END>",
				"Technische Universität Berlin" };
		String[] yStrings = {
				"<_START>Stanford University, Stanford, California, United States<_END>",
				"Stanford University" };
		examples.add(xStrings);
		System.out.println("length: " + xStrings[1].length());
		examples.add(yStrings);

		ProgSynthesis psProgSynthesis = new ProgSynthesis();
		psProgSynthesis.inite(examples);
		Collection<ProgramRule> ps = psProgSynthesis.run_main();
		// run_main() may return null (test4 checks for it); do not call iterator().next() blindly.
		if (ps == null || ps.isEmpty()) {
			System.out.println("Cannot find any rule");
			return;
		}
		ProgramRule pr = ps.iterator().next();
		String val = "Faculdade de Ciências, Universidade de Lisboa, Lisboa, Portugal";
		InterpreterType rule = pr.getRuleForValue(val);
		System.out.println(rule.execute(val));
	}

	/**
	 * Checks whether the record {@code Id} has the longest or the shortest "Tar" value among all
	 * records of {@code xHashMap}.
	 *
	 * <p>The "Tar" value of a record is whatever {@code UtilTools.StringColorCode} stores under
	 * the key {@code "Tar"} for the record's first two fields.
	 *
	 * @param xHashMap records keyed by id; element 0 and 1 of each value are passed to
	 *                 {@code UtilTools.StringColorCode}
	 * @param Id       key of the record to examine; must be present in {@code xHashMap}
	 * @return {@code true} if no other record is longer, or no other record is shorter
	 */
	public static boolean visible(HashMap<String, String[]> xHashMap, String Id) {
		String[] pair = xHashMap.get(Id);
		HashMap<String, String> tmp = new HashMap<String, String>();
		UtilTools.StringColorCode(pair[0], pair[1], tmp);
		int length = tmp.get("Tar").length();

		boolean shortest = true;
		boolean longest = true;
		for (String[] elem : xHashMap.values()) {
			HashMap<String, String> t = new HashMap<String, String>();
			UtilTools.StringColorCode(elem[0], elem[1], t);
			// Read from this element's own map: reading the reference record's map here would
			// compare the reference with itself and make the method return true unconditionally.
			int newl = t.get("Tar").length();
			if (newl > length) {
				longest = false;
			}
			if (newl < length) {
				shortest = false;
			}
		}
		return shortest || longest;
	}

	/**
	 * Runs the example-selection experiment on every {@code .csv} file directly under
	 * {@code dirpath} and prints the collected statistics.
	 *
	 * <p>A failure while processing one file is reported with a stack trace and does not stop
	 * the remaining files from being processed.
	 *
	 * @param dirpath directory containing the CSV files (first column original value, second
	 *                column expected value)
	 */
	public static void test4(String dirpath) {
		// Only consumed by the (currently disabled) hashResultPrint(records) debugging aid.
		HashMap<String, Vector<String>> records = new HashMap<String, Vector<String>>();
		File[] allfiles = new File(dirpath).listFiles();
		if (allfiles == null) {
			// listFiles() returns null when the path is not a readable directory.
			System.out.println("Cannot list files under: " + dirpath);
			return;
		}
		DataCollection dCollection = new DataCollection();
		for (File f : allfiles) {
			if (!isCsvFile(f)) {
				continue;
			}
			try {
				learnFromFile(f, dCollection, records);
			} catch (Exception ex) {
				ex.printStackTrace();
			}
		}
		dCollection.print();
		dCollection.print1();
		// hashResultPrint(records);
	}

	/**
	 * Runs the iterative learn / verify / pick-next-example loop of {@link #test4} on one file.
	 *
	 * <p>Starting from one example chosen by {@link ExampleSelection}, the loop synthesizes
	 * rules, applies them to all rows, and - while at least one row is flagged - asks
	 * {@link ExampleSelection} for the next row to add as an example. One {@link FileStat} is
	 * recorded per iteration.
	 */
	private static void learnFromFile(File f, DataCollection dCollection,
			HashMap<String, Vector<String>> records) throws Exception {
		Vector<String[]> entries = readPairs(f);
		if (entries.size() <= 1) {
			return;
		}
		Vector<String[]> examples = new Vector<String[]>();
		Vector<String[]> addExamples = new Vector<String[]>();

		// Layout of each record: org, tar, tarcode, label, right/wrong flag.
		HashMap<String, String[]> xHashMap = new HashMap<String, String[]>();
		for (int index = 0; index < entries.size(); index++) {
			String[] pair = entries.get(index);
			String[] line = { pair[0], pair[1], "", "", "wrong" };
			xHashMap.put(index + "", line);
		}

		ExampleSelection expsel = new ExampleSelection();
		expsel.firsttime = true;
		expsel.inite(xHashMap, null);
		int target = Integer.parseInt(expsel.Choose());
		examples.add(wrapExample(entries.get(target)));
		expsel.firsttime = false;

		while (true) { // repeat until every row is transformed correctly
			long checknumber = 1;
			HashMap<String, Vector<String[]>> expFeData = new HashMap<String, Vector<String[]>>();
			Vector<String> resultString = new Vector<String>();
			xHashMap = new HashMap<String, String[]>();

			ProgSynthesis psProgSynthesis = new ProgSynthesis();
			psProgSynthesis.inite(examples);
			Vector<ProgramRule> pls = new Vector<ProgramRule>();
			Collection<ProgramRule> ps = psProgSynthesis.run_main();
			if (ps != null) {
				pls.addAll(ps);
			} else {
				System.out.println("Cannot find any rule");
			}
			if (pls.size() == 0) {
				break;
			}

			// Last row flagged as wrong by any rule tried so far; null means all rows passed.
			String[] wexam = null;
			long t1 = System.currentTimeMillis();
			for (int i = 0; i < pls.size(); i++) {
				ProgramRule script = pls.get(i);
				StringBuilder res = new StringBuilder();
				for (int j = 0; j < entries.size(); j++) {
					String original = entries.get(j)[0];
					String expected = entries.get(j)[1];
					String wrapped = START_TOKEN + original + END_TOKEN;

					InterpreterType worker = script.getRuleForValue(original);
					String classlabel = script.getClassForValue(original);
					String tmps = worker.execute_debug(original);
					HashMap<String, String> dict = new HashMap<String, String>();
					UtilTools.StringColorCode(original, tmps, dict);
					String s = dict.get("Tar");
					String org = dict.get("Org");
					res.append(s).append("\n");
					if (ConfigParameters.debug == 1) {
						System.out.println("result: " + org + " " + dict.get("Tardis"));
					}

					if (s == null || s.length() == 0) {
						String[] ts = { wrapped, "", tmps, classlabel, "wrong" };
						xHashMap.put(j + "", ts);
						wexam = ts;
						checknumber++;
					}

					boolean isfind = false;
					String wrappedOrg = START_TOKEN + org + END_TOKEN;
					for (String[] exppair : examples) {
						if (exppair[0].equals(wrappedOrg)) {
							addFeatureExample(expFeData, classlabel, new String[] { org, tmps });
							isfind = true;
						}
					}
					// update positive training data with user specification
					for (String[] tmpx : addExamples) {
						// equals() instead of compareTo(): a missing "Org"/"Tar" value must not
						// abort the whole file with a NullPointerException.
						if (tmpx[0].equals(org) && tmpx[1].equals(s)) {
							addFeatureExample(expFeData, classlabel, new String[] { org, tmps });
							isfind = true;
						}
					}

					if (!isfind) {
						// A null result is handled like an empty one (see the check above).
						String produced = (s == null) ? "" : s;
						String[] ts = { wrapped, produced, tmps, classlabel, "right" };
						if (!produced.equals(expected)) {
							wexam = ts;
							ts[4] = "wrong";
						}
						xHashMap.put(j + "", ts);
					}
				}
				if (wexam == null) {
					break;
				}
				resultString.add(res.toString());
			}
			records.put(f.getName() + examples.size(), resultString);
			long t2 = System.currentTimeMillis();

			if (wexam != null) {
				String[] wexp = new String[2];
				while (true) {
					expsel = new ExampleSelection();
					expsel.inite(xHashMap, expFeData);
					int e = Integer.parseInt(expsel.Choose());
					String key = "" + e;
					System.out.println("Recommand Example: " + Arrays.toString(xHashMap.get(key)));
					if (!"right".equals(xHashMap.get(key)[4])) {
						wexp[0] = START_TOKEN + entries.get(e)[0] + END_TOKEN;
						wexp[1] = entries.get(e)[1];
						break;
					}
					// The recommended row was already correct:
					// update positive training data and drop it from the remaining data set.
					addExamples.add(entries.get(e));
					xHashMap.remove(key);
					checknumber++;
				}
				examples.add(wexp);
			}

			// Recorded after the new example (if any) was added so that the example count and
			// checknumber reflect this iteration's final state.
			FileStat fileStat = new FileStat(f.getName(), psProgSynthesis.learnspan,
					psProgSynthesis.genspan, (t2 - t1), examples.size(), examples,
					psProgSynthesis.ruleNo, checknumber, pls.get(0).toString());
			dCollection.addEntry(fileStat);
			if (wexam == null) {
				break;
			}
		}
	}

	/**
	 * Prints every key of {@code res} as a banner line followed by its values, one per line.
	 *
	 * @param res result strings grouped by an arbitrary key
	 */
	public static void hashResultPrint(HashMap<String, Vector<String>> res) {
		StringBuilder s = new StringBuilder();
		for (Map.Entry<String, Vector<String>> entry : res.entrySet()) {
			s.append("==============").append(entry.getKey()).append("=============\n");
			for (String value : entry.getValue()) {
				s.append(value).append("\n");
			}
		}
		System.out.println(s.toString());
	}

	/**
	 * Recursively explores the orders in which failing rows can be added as examples and records
	 * the longest and shortest example sequence in {@link #MaximalNumber}/{@link #larexamples}
	 * and {@link #MinimalNumber}/{@link #smalexamples}.
	 *
	 * @param all      all raw (unwrapped) rows of the data set
	 * @param cand     raw rows that may be chosen as the next example
	 * @param examples wrapped examples chosen so far; not modified
	 * @param cnt      number of examples chosen so far
	 */
	public static void test0_sub(Vector<String[]> all, Vector<String[]> cand,
			Vector<String[]> examples, int cnt) {
		if (cand.size() <= 0) {
			if (cnt > MaximalNumber) {
				MaximalNumber = cnt;
				larexamples = examples;
			}
			if (cnt < MinimalNumber) {
				MinimalNumber = cnt;
				smalexamples = examples;
			}
			return;
		}
		for (int p = 0; p < cand.size(); p++) {
			Vector<String[]> tmp = new Vector<String[]>(examples);
			tmp.add(wrapExample(cand.get(p)));

			ProgSynthesis psProgSynthesis = new ProgSynthesis();
			psProgSynthesis.inite(tmp);
			Collection<ProgramRule> ps = psProgSynthesis.run_main();
			if (ps == null || ps.isEmpty()) {
				break;
			}

			Vector<String[]> failing = new Vector<String[]>();
			for (ProgramRule script : ps) {
				for (String[] entry : all) {
					InterpreterType worker = script.getRuleForValue(entry[0]);
					String s = worker.execute(entry[0]);
					if (s == null || s.length() == 0 || !s.equals(entry[1])) {
						// Keep the raw row: it is wrapped when it is picked as an example in the
						// recursive call. Storing the wrapped form here would wrap it twice.
						failing.add(entry);
					}
				}
			}
			test0_sub(all, failing, tmp, cnt + 1);
		}
	}

	/**
	 * For every {@code .csv} file directly under {@code dirpath}, searches for the largest and
	 * smallest number of examples needed (see {@link #test0_sub}) and prints both sequences.
	 *
	 * @param dirpath directory containing the CSV files
	 */
	public static void test0(String dirpath) {
		File[] allfiles = new File(dirpath).listFiles();
		if (allfiles == null) {
			// listFiles() returns null when the path is not a readable directory.
			System.out.println("Cannot list files under: " + dirpath);
			return;
		}
		for (File f : allfiles) {
			if (!isCsvFile(f)) {
				continue;
			}
			try {
				Vector<String[]> entries = readPairs(f);
				if (entries.size() <= 1) {
					continue;
				}
				Vector<String[]> examples = new Vector<String[]>();
				Vector<String[]> candStrings = new Vector<String[]>(entries);
				test0_sub(entries, candStrings, examples, 0);

				System.out.println("File " + f.getName() + "\n");
				System.out.println("Max: " + MaximalNumber);
				System.out.println("Min: " + MinimalNumber);
				System.out.println("Largest: "
						+ formatExamples("Larget number of Examples:\n", larexamples));
				System.out.println("Smallest: "
						+ formatExamples("Smallest number of Examples:\n", smalexamples));

				resetSearchStatistics();
			} catch (Exception e) {
				System.out.println("" + e.toString());
			}
		}
	}

	/**
	 * Entry point: loads the configuration and runs {@link #test4} on a fixed directory.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		// load parameters
		ConfigParameters cfg = new ConfigParameters();
		cfg.initeParameters();
		DataCollection.config = cfg.getString();
		// Alternative experiments: Test.test0(<dir>) and Test.test1().
		Test.test4("/Users/bowu/Research/testdata/TestSingleFile");
	}

	/** Returns whether the file's name ends with {@code .csv}. */
	private static boolean isCsvFile(File f) {
		return f.getName().endsWith(".csv");
	}

	/**
	 * Reads rows from a CSV file until the end of the file or the first row with fewer than two
	 * columns, and always closes the reader.
	 */
	private static Vector<String[]> readPairs(File csvFile) throws IOException {
		Vector<String[]> rows = new Vector<String[]>();
		CSVReader reader = new CSVReader(new FileReader(csvFile), ',', '"', '\0');
		try {
			String[] row;
			while ((row = reader.readNext()) != null) {
				if (row.length <= 1) {
					break;
				}
				rows.add(row);
			}
		} finally {
			reader.close();
		}
		return rows;
	}

	/** Builds an example pair: the wrapped original value and the unchanged expected value. */
	private static String[] wrapExample(String[] rawPair) {
		return new String[] { START_TOKEN + rawPair[0] + END_TOKEN, rawPair[1] };
	}

	/** Appends {@code exp} to the list stored under {@code classlabel}, creating it if absent. */
	private static void addFeatureExample(HashMap<String, Vector<String[]>> expFeData,
			String classlabel, String[] exp) {
		Vector<String[]> bucket = expFeData.get(classlabel);
		if (bucket == null) {
			bucket = new Vector<String[]>();
			expFeData.put(classlabel, bucket);
		}
		bucket.add(exp);
	}

	/** Renders {@code header} followed by one {@code exp: a, b} line per example. */
	private static String formatExamples(String header, Vector<String[]> exps) {
		StringBuilder sb = new StringBuilder(header);
		for (String[] exp : exps) {
			sb.append(String.format("exp: %s, %s\n", exp[0], exp[1]));
		}
		return sb.toString();
	}

	/** Restores the search statistics to their declared initial values before the next file. */
	private static void resetSearchStatistics() {
		MaximalNumber = INITIAL_MAXIMAL_NUMBER;
		larexamples = new Vector<String[]>();
		MinimalNumber = INITIAL_MINIMAL_NUMBER;
		smalexamples = new Vector<String[]>();
	}
}