/******************************************************************************* * Copyright 2012 University of Southern California * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. * * This code was developed by the Information Integration Group as part * of the Karma project at the Information Sciences Institute of the * University of Southern California. For more information, publications, * and related projects, please see: http://www.isi.edu/integration ******************************************************************************/ package edu.isi.karma.cleaning; import java.io.File; import java.io.FileReader; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Set; import java.util.Vector; import au.com.bytecode.opencsv.CSVReader; import com.hp.hpl.jena.tdb.store.Hash; //select the most reprentative records from a huge list of rows //to ensure that the program learned from the subset could work correctly on the whole dataset // first support public class RecordDistiller { // {anchor:{"Id": , "Count": , "LefContext":[], "RigContext":[]} public static int cxt_limit = 3; public int totalnumber = 0; public HashMap<String,Anchor> anchors = new HashMap<String,Anchor>(); public void readRecord(String ID, Vector<TNode> record) { HashMap<String, Integer> curIndices = new HashMap<String, Integer>(); for(int i = 1; i< record.size()-1; i++) // skip the start and end token { TNode t = record.get(i); String type = t.getType(); String anchor = type; if(curIndices.containsKey(type)) { int cnt = curIndices.get(type); curIndices.put(type, cnt+1); anchor += cnt; } else { curIndices.put(type, 0); anchor += "0"; } //get left and right context String lcxt = ""; String rcxt = ""; for(int j = i; j<i+this.cxt_limit && j <record.size(); j++) { rcxt += record.get(j).getType(); } for(int j= i; j>=0&& j> i-cxt_limit; j--) { lcxt += record.get(j).getType(); } //update the anchor repository if(this.anchors.containsKey(anchor)) { Anchor an = this.anchors.get(anchor); an.IDs.add(ID); an.count += 1; an.lefCxt.put(ID, lcxt); an.rigCxt.put(ID, rcxt); } else { Vector<String> Ids = new Vector<String>(); Ids.add(ID); HashMap<String,String> vlcxt = new HashMap<String,String>(); vlcxt.put(ID,lcxt); HashMap<String,String> vrcxt = new HashMap<String,String>(); vrcxt.put(ID,rcxt); Anchor nan = new Anchor(anchor, Ids, 1, vlcxt, vrcxt); anchors.put(anchor, nan); } } } //identify the anchor tokens public void idenAnchor(int total) { Vector<String> dels = new Vector<String>(); for(String a:anchors.keySet()) { int count = anchors.get(a).count; // if an anchor appears in more 10% records, it's a valid anchor if(count*1.0/total < 0.1) { dels.add(a); } } for(String s:dels) { anchors.remove(s); } } // identify the representative records of one anchor. //minimal set public HashSet<String> idenAnchorRecords(String anchor) { HashMap<String,Vector<String>> lcxt2ids = new HashMap<String, Vector<String>>(); HashMap<String,Vector<String>> rcxt2ids = new HashMap<String, Vector<String>>(); for(String Id: this.anchors.get(anchor).lefCxt.keySet()) { String s =this.anchors.get(anchor).lefCxt.get(Id); boolean isnew = true; for(String elem:lcxt2ids.keySet()) { if(elem.indexOf(s) == 0) { s = elem; isnew = false; } } if(isnew) { Vector<String> vs = new Vector<String>(); vs.add(Id); lcxt2ids.put(s, vs); } else { lcxt2ids.get(s).add(Id); } } for(String Id: this.anchors.get(anchor).rigCxt.keySet()) { String s =this.anchors.get(anchor).rigCxt.get(Id); boolean isnew = true; for(String elem:rcxt2ids.keySet()) { if(elem.indexOf(s) == 0) { s = elem; isnew = false; } } if(isnew) { Vector<String> vs = new Vector<String>(); vs.add(Id); rcxt2ids.put(s, vs); } else { rcxt2ids.get(s).add(Id); } } // generate candiate set HashSet<String> result = new HashSet<String>(); for(String cxt: lcxt2ids.keySet()) { if(lcxt2ids.get(cxt).size() != 0) { String idString = lcxt2ids.get(cxt).get(0); if(!result.contains(idString)) { result.add(idString); } } } for(String cxt: rcxt2ids.keySet()) { if(rcxt2ids.get(cxt).size() != 0) { String idString = rcxt2ids.get(cxt).get(0); if(!result.contains(idString)) { result.add(idString); } } } return result; } //merge the record sets generated by each anchor //return the final Record ID list public HashSet<String> refineRecords() { HashSet<String> ids = new HashSet<String>(); // find the union of the ids of all anchors for(String anchor:this.anchors.keySet()) { HashSet<String> set = this.idenAnchorRecords(anchor); ids.addAll(set); } return ids; } public static void main(String[] args) { String dirpath = "/Users/bowu/Research/testdata/TestSingleFile"; RecordDistiller distiller = new RecordDistiller(); File nf = new File(dirpath); File[] allfiles = nf.listFiles(); for (File f : allfiles) { try { if (f.getName().indexOf(".csv") == (f.getName().length() - 4)) { CSVReader cr = new CSVReader(new FileReader(f), ',','"','\0'); String[] pair; int id = 0; HashMap<String, String> id2String = new HashMap<String, String>(); while ((pair = cr.readNext()) != null) { if (pair == null || pair.length <= 1) break; Ruler ruler = new Ruler(); ruler.setNewInput(pair[0]); distiller.readRecord(""+id, ruler.vec); id2String.put(""+id, pair[0]); id++; } distiller.idenAnchor(id2String.keySet().size()); HashSet<String> allids = distiller.refineRecords(); for(String xid: allids) { System.out.println(id2String.get(xid)); } double compressRate = (allids.size()*1.0)/id2String.keySet().size(); for(String name:distiller.anchors.keySet()) { System.out.println("Anchor: "+name); for(String dString: distiller.anchors.get(name).IDs) { System.out.print(" "+dString); } System.out.println("\n"); } System.out.println(""+compressRate); } } catch (Exception e) { System.out.println(""+e.toString()); } } } } class Anchor{ public String name; public Vector<String> IDs; public int count; // id 2 the left context public HashMap<String, String> lefCxt = new HashMap<String, String>(); // id 2 the right context public HashMap<String, String> rigCxt =new HashMap<String, String>(); public Anchor(String anchor,Vector<String> Ids,int count, HashMap<String, String> lcxt, HashMap<String, String> rcxt) { this.name = anchor; this.IDs = Ids; this.count = count; this.lefCxt = lcxt; this.rigCxt = rcxt; } }