/*******************************************************************************
 * Copyright 2012 University of Southern California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This code was developed by the Information Integration Group as part
 * of the Karma project at the Information Sciences Institute of the
 * University of Southern California. For more information, publications,
 * and related projects, please see: http://www.isi.edu/integration
 ******************************************************************************/
package edu.isi.karma.cleaning.QuestionableRecord;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Vector;

import edu.isi.karma.cleaning.RecFeature;
import edu.isi.karma.cleaning.Ruler;
import edu.isi.karma.cleaning.TNode;

public class OutlierDetector {

	// mean feature vector for each partition id
	public HashMap<String, double[]> rVectors = new HashMap<String, double[]>();
	// largest distance observed in the most recent call to getOutliers
	public double currentMax = -1;
	// vocabulary of tokens seen in the original and transformed records
	public HashSet<String> dict = new HashSet<String>();

	public OutlierDetector() {
	}

	// Euclidean distance between two feature vectors of equal length
	public double getDistance(double[] x, double[] y) {
		if (x.length != y.length)
			return Double.MAX_VALUE;
		double value = 0.0;
		for (int i = 0; i < x.length; i++) {
			value += Math.pow(x[i] - y[i], 2);
		}
		return Math.sqrt(value);
	}

	// collect all tokens appearing in the original (org) and transformed (tar) records
	public void buildDict(Collection<String[]> data) {
		HashSet<String> xHashSet = new HashSet<String>();
		for (String[] pair : data) {
			Ruler r = new Ruler();
			r.setNewInput(pair[0]);
			Vector<TNode> v = r.vec;
			r.setNewInput(pair[1]);
			v.addAll(r.vec);
			for (TNode t : v) {
				if (!xHashSet.contains(t.text)) {
					xHashSet.add(t.text);
				}
			}
		}
		this.dict = xHashSet;
	}
	// Find the most questionable record in one partition: score each test record
	// as a feature vector and return the id of the record whose Euclidean distance
	// from the partition's mean vector exceeds Max by the largest margin.
	// testdata maps a row id to {tar, tarcolor}.
	public String getOutliers(HashMap<String, String[]> testdata, double[] meanVector, double Max, HashSet<String> dic) {
		String Id = "";
		for (String key : testdata.keySet()) {
			String[] vpair = testdata.get(key);
			FeatureVector fvFeatureVector = new FeatureVector(dic);
			Vector<RecFeature> vRecFeatures = fvFeatureVector.createVector(vpair[0], vpair[1]);
			double[] x = new double[fvFeatureVector.size()];
			for (int i = 0; i < vRecFeatures.size(); i++) {
				x[i] = vRecFeatures.get(i).computerScore();
			}
			double value = this.getDistance(x, meanVector);
			// System.out.println("current: " + vpair[0] + " " + Max);
			// System.out.println("=======\n" + this.test(x) + "\n" + this.test(meanVector));
			// System.out.println("distance: " + value);
			if (value > Max) {
				Max = value;
				this.currentMax = Max;
				Id = key;
			}
		}
		return Id;
	}

	// Build the mean feature vector for every partition.
	// data maps a partition id (pid) to its records, each record being {rawstring, code}.
	public void buildMeanVector(HashMap<String, Vector<String[]>> data, HashSet<String> dict) {
		if (data == null)
			return;
		for (String key : data.keySet()) {
			Vector<String[]> vs = data.get(key);
			FeatureVector fVector = new FeatureVector(dict);
			double[] dvec = new double[fVector.size()];
			for (int i = 0; i < dvec.length; i++) {
				dvec[i] = 0;
			}
			for (String[] elem : vs) {
				Vector<RecFeature> sFeatures = fVector.createVector(elem[0], elem[1]);
				for (int j = 0; j < sFeatures.size(); j++) {
					dvec[j] += sFeatures.get(j).computerScore();
				}
			}
			// average the accumulated scores over the records in the partition
			for (int i = 0; i < dvec.length; i++) {
				dvec[i] = dvec[i] * 1.0 / vs.size();
			}
			rVectors.put(key, dvec);
		}
	}

	// Render a feature vector as a comma-separated string (used for debugging).
	public String test(double[] row) {
		if (row.length == 0)
			return "";
		String string = "";
		for (double d : row) {
			string += d + ",";
		}
		// drop the trailing comma
		return string.substring(0, string.length() - 1);
	}
}
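
/*
 * A minimal usage sketch (not part of the original Karma sources). The sample
 * strings, the partition id "p0", and the {tar, tarcolor} values below are
 * illustrative assumptions; in Karma the (original, transformed) pairs come
 * from the interactive cleaning session.
 */
class OutlierDetectorUsageSketch {
	public static void main(String[] args) {
		// records already transformed by the learned cleaning program (assumed sample data)
		Vector<String[]> confirmedPairs = new Vector<String[]>();
		confirmedPairs.add(new String[] { "Jan 5, 2012", "2012-01-05" });
		confirmedPairs.add(new String[] { "Feb 7, 2012", "2012-02-07" });

		OutlierDetector detector = new OutlierDetector();
		// build the token vocabulary over both the original and transformed strings
		detector.buildDict(confirmedPairs);

		// one partition ("p0") whose mean feature vector we want
		HashMap<String, Vector<String[]>> partitions = new HashMap<String, Vector<String[]>>();
		partitions.put("p0", confirmedPairs);
		detector.buildMeanVector(partitions, detector.dict);

		// candidate rows to screen: row id -> {tar, tarcolor}
		HashMap<String, String[]> testData = new HashMap<String, String[]>();
		testData.put("row1", new String[] { "Mar 9, 2012", "2012-03-09" });
		testData.put("row2", new String[] { "not a date", "not a date" });

		// id of the row farthest (in feature space) from the partition mean
		String outlierId = detector.getOutliers(testData, detector.rVectors.get("p0"), 0.0, detector.dict);
		System.out.println("Most questionable row: " + outlierId + " (distance " + detector.currentMax + ")");
	}
}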