/* Copyright 2004, Carnegie Mellon, All Rights Reserved */ package edu.cmu.minorthird.text.learn; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import edu.cmu.minorthird.text.FancyLoader; import edu.cmu.minorthird.text.MonotonicTextLabels; import edu.cmu.minorthird.text.MutableTextLabels; import edu.cmu.minorthird.text.RegexTokenizer; import edu.cmu.minorthird.text.Span; import edu.cmu.minorthird.text.SpanDifference; import edu.cmu.minorthird.text.TextLabelsLoader; import edu.cmu.minorthird.text.mixup.MixupInterpreter; import edu.cmu.minorthird.text.mixup.MixupProgram; import edu.cmu.minorthird.util.BasicCommandLineProcessor; import edu.cmu.minorthird.util.CommandLineProcessor; import edu.cmu.minorthird.util.IOUtil; /** * A name matching scheme on top of a given extractor, fit for spanTypes * depicting personal names. This class applies a given annotator. Then, it uses * the output extractor's dictionary of predicted names and over-rides some of * the original predictions, using the NameMatcher scheme. This procedure * increases recall, at low cost of precision. * * @author Richard Wang, edited by Einat Minkov */ // need to store the names lists in a sorted list (so the name would be matched // from long to short) public class ExtractorNameMatcher{ private File fromFile=null; private File saveAs=null; private MonotonicTextLabels textLabels=null; private MonotonicTextLabels annLabels=null; private String predType="_prediction"; private String spanType=""; private static double threshold=16; private ExtractorAnnotator ann=null; private SpanDifference finalSD=null; private List<String> nameDict=new ArrayList<String>(); private static final String DIV="@#!"; private static final int WINDOW_SIZE=5; private static final int SIG_SIZE=2; // number of tokens at the end of e-mail // in search for signatures // private static final File fixMixup=new File("fixEnv.mixup"); private List<String> lowRiskNameList=new ArrayList<String>(); private List<String> highRiskNameList=new ArrayList<String>(); private List<String> deletedNameList=new ArrayList<String>(); public double getTokenPrecision(){ return finalSD.tokenPrecision(); } public double getTokenRecall(){ return finalSD.tokenRecall(); } public ExtractorNameMatcher(MonotonicTextLabels labels){ this.annLabels=labels; } public ExtractorNameMatcher(){} // // command-line processing // public class MyCLP extends BasicCommandLineProcessor{ public void loadFrom(String s){ fromFile=new File(s); } public void saveAs(String s){ saveAs=new File(s); } public void labels(String s){ textLabels=(MutableTextLabels)FancyLoader.loadTextLabels(s); } public void spanType(String s){ spanType=s; } @Override public void usage(){ for(int i=0;i<USAGE.length;i++) System.out.println(USAGE[i]); } } public CommandLineProcessor getCLP(){ return new MyCLP(); } static private final String[] USAGE= { "ExtractorNameMatcher: increase recall of a previously-learned extractor, " +"applying a name matching scheme", "", "Parameters:", " -loadFrom FILE where to load a previously-learner extractor from", " -labels KEY the key for the labels, in which names are to be extracted", " [-spanType String] the span type of the true names. The default is set to true_name", " [-saveAs FILE] a file to save the new post-name matching labels", "",}; public void doMain(){ if(annLabels==null){ if(fromFile==null) throw new IllegalStateException("need to specify -loadFrom"); // load the annotator try{ ann=(ExtractorAnnotator)IOUtil.loadSerialized(fromFile); }catch(IOException ex){ throw new IllegalArgumentException("can't load annotator from "+ fromFile+": "+ex); } annLabels=(MonotonicTextLabels)ann.annotatedCopy(textLabels); } // create dictionary, sorted by names' length Set<String> allNames=new HashSet<String>(); for(Iterator<Span> it=annLabels.instanceIterator(predType);it.hasNext();){ Span sp=it.next(); allNames.add(sp.asString()); } nameDict=new ArrayList<String>(allNames); Collections.sort(nameDict,new Comparator<String>(){ @Override public int compare(String o1,String o2){ return new Integer(o2.length()).compareTo(new Integer(o1.length())); } }); FreqAnal fa=new FreqAnal(annLabels,predType); // transorm-extend dictionary per pre-defined personal name-specific // templates. // identify 'high-risk' names and eliminate them from the extended // dictionary. transformDict(fa); int counter=0; /** * System.out.println("High Confidence Names:"); for (Iterator i = * nameList.iterator(); i.hasNext();) System.out.println(++counter + ". " + * i.next()); counter = 0; */ System.out.println("Low Risk Names:"); for(Iterator<String> i=lowRiskNameList.iterator();i.hasNext();) System.out.println(++counter+". "+i.next()); counter=0; System.out.println("High Risk Names:"); for(Iterator<String> i=highRiskNameList.iterator();i.hasNext();) System.out.println(++counter+". "+i.next()); counter=0; System.out.println("Deleted Names:"); for(Iterator<String> i=deletedNameList.iterator();i.hasNext();) System.out.println(++counter+". "+i.next()); applyDict(); MixupProgram p=null; try{ // BUG: THIS FILE DOES NOT EXIST AND THE WAY ITS ACCESSED IS WRONG p=new MixupProgram(new File("c:\\minorthird\\apps\\names\\fixEnv.mixup")); }catch(Exception e){ System.out.println(e); } MixupInterpreter interp=new MixupInterpreter(p); interp.eval(annLabels); if(saveAs!=null){ try{ (new TextLabelsLoader()).saveTypesAsOps(annLabels,saveAs); }catch(IOException e){ try{ (new TextLabelsLoader()).saveTypesAsOps(annLabels,new File( "name-matching-labels.env")); }catch(Exception e2){ System.out.println(e2); } } } // TextBaseViewer.view(annLabels); SpanDifference sd; System.out .println("============================================================"); System.out.println("Pre names-matching:"); sd= new SpanDifference(annLabels.instanceIterator(predType),annLabels .instanceIterator(spanType),annLabels.closureIterator(spanType)); System.out.println(sd.toSummary()); System.out.println("Post names-matching:"); finalSD= new SpanDifference(annLabels .instanceIterator(predType+"_updated_fixed"),annLabels .instanceIterator(spanType),annLabels.closureIterator(spanType)); System.out.println(finalSD.toSummary()); } private void applyDict(){ int counter=0; for(Iterator<Span> i=annLabels.getTextBase().documentSpanIterator();i .hasNext();){ // if (counter==5) TextBaseViewer.view(annLabels); Span docSpan=i.next(); System.out.println(((float)++counter/annLabels.getTextBase().size()*100)+ "% Working on "+docSpan.getDocumentId()+"..."); for(int j=0;j<docSpan.size();j++){ Span tokenWindow= docSpan.subSpan(j,Math.min(docSpan.size()-j,WINDOW_SIZE)); Span nameMatch=dictLookup(lowRiskNameList,tokenWindow); if(nameMatch!=null){ System.out.println("! Found: "+ nameMatch.asString().replaceAll("[\r\n\\s]+"," ")+" matches "+ tokenWindow.asString().replaceAll("[\r\n\\s]+"," ")); annLabels.addToType(nameMatch,predType+"_updated"); j+=nameMatch.size()-1; } } // for signature detection for(int j=docSpan.size()-SIG_SIZE;j<docSpan.size();j++){ Span tokenWindow= docSpan.subSpan(j,Math.min(docSpan.size()-j,WINDOW_SIZE)); Span nameMatch=dictLookup(highRiskNameList,tokenWindow); if(nameMatch!=null){ System.out.println("! Found: "+ nameMatch.asString().replaceAll("[\r\n\\s]+"," ")+" matches "+ tokenWindow.asString().replaceAll("[\r\n\\s]+"," ")); annLabels.addToType(nameMatch,predType+"_updated"); j+=nameMatch.size()-1; } } } } private Span dictLookup(List<String> nameList,Span tokenWindow){ // old code created a BasicTextBase() and called splitIntoTokens(name) RegexTokenizer tokenizer=new RegexTokenizer(); for(Iterator<String> i=nameList.iterator();i.hasNext();){ String name=i.next(); String tokens=tokenWindow.asString().replaceAll("[\r\n\\s]+"," "); if(tokens.toLowerCase().matches("(?i)(?s)^\\Q"+name+"\\E(\\W|$).*")){ int numTokens=tokenizer.splitIntoTokens(name).length; return tokenWindow.subSpan(0,numTokens); } } return null; } private void transformDict(FreqAnal fa){ for(Iterator<String> i=nameDict.iterator();i.hasNext();){ List<String> transformedNames=transformName(i.next()); for(Iterator<String> j=transformedNames.iterator();j.hasNext();){ String tn=j.next(); boolean lowRisk=(tn.indexOf(DIV)==-1); boolean highRisk=(tn.matches("(\\w"+DIV+")+")); tn=tn.replaceAll(DIV,""); Double hScore=fa.getHScore(tn); if(hScore!=null&&hScore.doubleValue()<threshold){ deletedNameList.add(tn); continue; } if(lowRisk) lowRiskNameList.add(tn); else if(highRisk) highRiskNameList.add(tn); } } lowRiskNameList=uniqueSortedList(lowRiskNameList); highRiskNameList=uniqueSortedList(highRiskNameList); deletedNameList=uniqueSortedList(deletedNameList); } private List<String> transformName(String name){ List<String> result=new ArrayList<String>(); String str=name.toLowerCase().trim().replaceAll("[^a-zA-Z\\- ]+",""); // if (str.trim().replaceAll("\\W", "").length() > 1) result.add(str); String s[]=str.split("[\\- ]+"); Object[] array=new Object[0]; if(s.length==1){ int[][] order={{0}}; array=transform(s,order); }else if(s.length==2){ int[][] order={{0,1},{0}}; array=transform(s,order); }else if(s.length==3){ int[][] order={{0,1,2},{0,2},{2},{0}}; array=transform(s,order); }else if(s.length==4){ int[][] order={{0,1,2,3},{0,1,3},{0,3},{3},{0}}; array=transform(s,order); } for(int i=0;i<array.length;i++){ String temp=((String)array[i]).trim(); if(temp.replaceAll("\\W","").length()<2) continue; if(temp.matches(".*-$")) continue; result.add(temp); } return result; } private Object[] transform(String[] s,int[][] order){ List<Object> result=new ArrayList<Object>(); Object[][] o=new Object[s.length][]; for(int i=0;i<s.length;i++) o[i]=transformToken(s[i],(i==0),(i==s.length-1)); for(int i=0;i<order.length;i++){ int[] cur_order=order[i]; if(cur_order.length==1) for(int j=0;j<o[cur_order[0]].length;j++) result.add(o[cur_order[0]][j]); else if(cur_order.length==2) for(int j=0;j<o[cur_order[0]].length;j++) for(int k=0;k<o[cur_order[1]].length;k++) result.add((String)o[cur_order[0]][j]+o[cur_order[1]][k]); else if(cur_order.length==3) for(int j=0;j<o[cur_order[0]].length;j++) for(int k=0;k<o[cur_order[1]].length;k++) for(int l=0;l<o[cur_order[2]].length;l++) result.add((String)o[cur_order[0]][j]+o[cur_order[1]][k]+ o[cur_order[2]][l]); else if(cur_order.length==4) for(int j=0;j<o[cur_order[0]].length;j++) for(int k=0;k<o[cur_order[1]].length;k++) for(int l=0;l<o[cur_order[2]].length;l++) for(int m=0;m<o[cur_order[3]].length;m++) result.add((String)o[cur_order[0]][j]+o[cur_order[1]][k]+ o[cur_order[2]][l]+o[cur_order[3]][m]); } return result.toArray(); } private List<String> uniqueSortedList(List<String> list){ Set<String> set=new HashSet<String>(); for(Iterator<String> i=list.iterator();i.hasNext();){ String str=i.next(); set.add(str); } List<String> al=new ArrayList<String>(set); Collections.sort(al,new Comparator<String>(){ @Override public int compare(String o1,String o2){ return new Integer(o2.length()).compareTo(o1.length()); } }); return al; } private Object[] transformToken(String name,boolean first,boolean last){ List<String> result=new ArrayList<String>(); if(name.length()==0) return result.toArray(); if(last) result.add(name); if(!last) result.add(name+" "); if(!last) result.add(name+"-"); if(!last) result.add(name.substring(0,1)+". "); if(last) result.add(name.substring(0,1)+"."); result.add(name.substring(0,1)+DIV); return result.toArray(); } /** */ public static void main(String[] args){ try{ ExtractorNameMatcher nm=new ExtractorNameMatcher(); nm.getCLP().processArguments(args); nm.doMain(); }catch(Exception ex){ ex.printStackTrace(); } } }