/* Copyright 2004, Carnegie Mellon, All Rights Reserved */ package edu.cmu.minorthird.text.learn; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Set; import edu.cmu.minorthird.text.AbstractAnnotator; import edu.cmu.minorthird.text.FancyLoader; import edu.cmu.minorthird.text.MonotonicTextLabels; import edu.cmu.minorthird.text.MutableTextLabels; import edu.cmu.minorthird.text.RegexTokenizer; import edu.cmu.minorthird.text.Span; import edu.cmu.minorthird.text.SpanDifference; import edu.cmu.minorthird.text.TextLabels; import edu.cmu.minorthird.text.TextLabelsLoader; import edu.cmu.minorthird.text.gui.TextBaseViewer; import edu.cmu.minorthird.text.mixup.MixupInterpreter; import edu.cmu.minorthird.text.mixup.MixupProgram; import edu.cmu.minorthird.util.IOUtil; /** * A name matching scheme on top of a given extractor, fit for spanTypes * depicting personal names. This class applies a given annotator. Then, it uses * the output extractor's dictionary of predicted names and over-rides some of * the original predictions, using the NameMatcher scheme. This procedure * increases recall, at low cost of precision. * * @author Richard Wang, edited by Einat Minkov */ // need to store the names lists in a sorted list (so the name would be matched // from long to short) public class NameMatcher extends AbstractAnnotator{ private String predType="_prediction"; private String spanType="true_name"; private static double threshold=16; private List<String> nameDict=new ArrayList<String>(); private static final String DIV="@#!"; private static final int WINDOW_SIZE=5; private static final int SIG_SIZE=2; // number of tokens at the end of e-mail // in search for signatures private List<String> lowRiskNameList=new ArrayList<String>(); private List<String> highRiskNameList=new ArrayList<String>(); private List<String> deletedNameList=new ArrayList<String>(); private static MonotonicTextLabels postLabels=null; private static boolean Remove_Single_Tokens_Low_PFIDF=true; public NameMatcher(String spanType){ this.spanType=spanType; } public NameMatcher(){ ; } public String getSpanType(){ return spanType; } public void setSpanType(String spanType){ this.spanType=spanType; } @Override protected void doAnnotate(MonotonicTextLabels labels){ // create dictionary, sorted by names' length Set<String> allNames=new HashSet<String>(); for(Iterator<Span> it=labels.instanceIterator(predType);it.hasNext();){ Span sp=it.next(); allNames.add(sp.asString()); } nameDict=new ArrayList<String>(allNames); Collections.sort(nameDict,new Comparator<String>(){ @Override public int compare(String o1,String o2){ return new Integer(o2.length()).compareTo(new Integer(o1.length())); } }); FreqAnal fa=new FreqAnal(labels,predType); // transorm-extend dictionary per pre-defined personal name-specific // templates. // identify 'high-risk' names and eliminate them from the extended // dictionary. transformDict(fa); int counter=0; /** * System.out.println("High Confidence Names:"); for (Iterator i = * nameList.iterator(); i.hasNext();) System.out.println(++counter + ". " + * i.next()); counter = 0; */ System.out.println("Low Risk Names:"); for(Iterator<String> i=lowRiskNameList.iterator();i.hasNext();) System.out.println(++counter+". "+i.next()); counter=0; System.out.println("High Risk Names:"); for(Iterator<String> i=highRiskNameList.iterator();i.hasNext();) System.out.println(++counter+". "+i.next()); counter=0; System.out.println("Deleted Names:"); for(Iterator<String> i=deletedNameList.iterator();i.hasNext();) System.out.println(++counter+". "+i.next()); applyDictIncreaseRecall(labels); if(Remove_Single_Tokens_Low_PFIDF) applyDictIncreasePrecision(postLabels); } @Override public String explainAnnotation(TextLabels labels,Span span){ return "No explanation implemented."; } private void applyDictIncreaseRecall(MonotonicTextLabels labels){ int counter=0; for(Iterator<Span> i=labels.getTextBase().documentSpanIterator();i .hasNext();){ Span docSpan=i.next(); System.out.println(((float)++counter/labels.getTextBase().size()*100)+ "% Working on "+docSpan.getDocumentId()+"..."); for(int j=0;j<docSpan.size();j++){ Span tokenWindow= docSpan.subSpan(j,Math.min(docSpan.size()-j,WINDOW_SIZE)); Span nameMatch=dictLookup(lowRiskNameList,tokenWindow); if(nameMatch!=null){ System.out.println("! Found: "+ nameMatch.asString().replaceAll("[\r\n\\s]+"," ")+" matches "+ tokenWindow.asString().replaceAll("[\r\n\\s]+"," ")); labels.addToType(nameMatch,predType+"_updated"); j+=nameMatch.size()-1; } } // for signature detection for(int j=docSpan.size()-SIG_SIZE;j<docSpan.size();j++){ Span tokenWindow= docSpan.subSpan(j,Math.min(docSpan.size()-j,WINDOW_SIZE)); Span nameMatch=dictLookup(highRiskNameList,tokenWindow); if(nameMatch!=null){ System.out.println("! Found: "+ nameMatch.asString().replaceAll("[\r\n\\s]+"," ")+" matches "+ tokenWindow.asString().replaceAll("[\r\n\\s]+"," ")); labels.addToType(nameMatch,predType+"_updated"); j+=nameMatch.size()-1; } } } NameMatcher.postLabels=labels; } private void applyDictIncreasePrecision(MonotonicTextLabels labels){ int counter=0; for(Iterator<Span> i=labels.getTextBase().documentSpanIterator();i.hasNext();){ Span docSpan=i.next(); System.out.println(((float)++counter/labels.getTextBase().size()*100)+ "% Working on "+docSpan.getDocumentId()+"..."); for(Iterator<Span> k= labels.instanceIterator(predType,docSpan.getDocumentId());k.hasNext();){ Span span=k.next(); if(span.size()==1){ String token=span.getToken(0).getValue().toLowerCase(); if(deletedNameList.contains(token)){ labels.setProperty(span.getToken(0),"delete","t"); } } } } NameMatcher.postLabels=labels; } private Span dictLookup(List<String> nameList,Span tokenWindow){ RegexTokenizer tokenizer=new RegexTokenizer(); for(Iterator<String> i=nameList.iterator();i.hasNext();){ String name=i.next(); String tokens=tokenWindow.asString().replaceAll("[\r\n\\s]+"," "); if(tokens.toLowerCase().matches("(?i)(?s)^\\Q"+name+"\\E(\\W|$).*")){ int numTokens=tokenizer.splitIntoTokens(name).length; return tokenWindow.subSpan(0,numTokens); } } return null; } private void transformDict(FreqAnal fa){ for(Iterator<String> i=nameDict.iterator();i.hasNext();){ List<String> transformedNames=transformName(i.next()); for(Iterator<String> j=transformedNames.iterator();j.hasNext();){ String tn=j.next(); boolean lowRisk=(tn.indexOf(DIV)==-1); boolean highRisk=(tn.matches("(\\w"+DIV+")+")); tn=tn.replaceAll(DIV,""); Double hScore=fa.getHScore(tn); if(hScore!=null&&hScore.doubleValue()<threshold){ deletedNameList.add(tn); continue; } if(lowRisk) lowRiskNameList.add(tn); else if(highRisk) highRiskNameList.add(tn); } } lowRiskNameList=uniqueSortedList(lowRiskNameList); highRiskNameList=uniqueSortedList(highRiskNameList); deletedNameList=uniqueSortedList(deletedNameList); } private List<String> transformName(String name){ List<String> result=new ArrayList<String>(); String str=name.toLowerCase().trim().replaceAll("[^a-zA-Z\\- ]+",""); // if (str.trim().replaceAll("\\W", "").length() > 1) result.add(str); String s[]=str.split("[\\- ]+"); Object[] array=new Object[0]; if(s.length==1){ int[][] order={{0}}; array=transform(s,order); }else if(s.length==2){ int[][] order={{0,1},{0}}; array=transform(s,order); }else if(s.length==3){ int[][] order={{0,1,2},{0,2},{2},{0}}; array=transform(s,order); }else if(s.length==4){ int[][] order={{0,1,2,3},{0,1,3},{0,3},{3},{0}}; array=transform(s,order); } for(int i=0;i<array.length;i++){ String temp=((String)array[i]).trim(); if(temp.replaceAll("\\W","").length()<2) continue; if(temp.matches(".*-$")) continue; result.add(temp); } return result; } private Object[] transform(String[] s,int[][] order){ List<Object> result=new ArrayList<Object>(); Object[][] o=new Object[s.length][]; for(int i=0;i<s.length;i++) o[i]=transformToken(s[i],(i==0),(i==s.length-1)); for(int i=0;i<order.length;i++){ int[] cur_order=order[i]; if(cur_order.length==1) for(int j=0;j<o[cur_order[0]].length;j++) result.add(o[cur_order[0]][j]); else if(cur_order.length==2) for(int j=0;j<o[cur_order[0]].length;j++) for(int k=0;k<o[cur_order[1]].length;k++) result.add((String)o[cur_order[0]][j]+o[cur_order[1]][k]); else if(cur_order.length==3) for(int j=0;j<o[cur_order[0]].length;j++) for(int k=0;k<o[cur_order[1]].length;k++) for(int l=0;l<o[cur_order[2]].length;l++) result.add((String)o[cur_order[0]][j]+o[cur_order[1]][k]+ o[cur_order[2]][l]); else if(cur_order.length==4) for(int j=0;j<o[cur_order[0]].length;j++) for(int k=0;k<o[cur_order[1]].length;k++) for(int l=0;l<o[cur_order[2]].length;l++) for(int m=0;m<o[cur_order[3]].length;m++) result.add((String)o[cur_order[0]][j]+o[cur_order[1]][k]+ o[cur_order[2]][l]+o[cur_order[3]][m]); } return result.toArray(); } private List<String> uniqueSortedList(List<String> list){ List<String> al=new ArrayList<String>(new HashSet<String>(list)); Collections.sort(al,new Comparator<String>(){ @Override public int compare(String o1,String o2){ return new Integer(o2.length()).compareTo(new Integer(o1.length())); } }); return al; } private Object[] transformToken(String name,boolean first,boolean last){ List<String> result=new ArrayList<String>(); if(name.length()==0) return result.toArray(); if(last) result.add(name); if(!last) result.add(name+" "); if(!last) result.add(name+"-"); if(!last) result.add(name.substring(0,1)+". "); if(last) result.add(name.substring(0,1)+"."); result.add(name.substring(0,1)+DIV); return result.toArray(); } private static void usage(){ System.err .println("ExtractorNameMatcher: increase recall of a previously-learned extractor, "); System.err.println("applying a name matching scheme"); System.err.println("Parameters:"); System.err .println(" -loadFrom FILE where to load a previously-learner extractor from"); System.err .println(" -labels KEY the key for the labels, in which names are to be extracted"); System.err .println(" -spanType String the span type of the true names. Usually, it is 'true_name'"); System.err .println(" [-saveAs FILE] a file to save the new post-name matching labels"); System.err.println(""); System.exit(1); } public static void main(String[] args) throws IOException{ File fromFile=null; File saveAs=new File("NM_labels.env"); String spanType=""; MonotonicTextLabels textLabels=null; MonotonicTextLabels annLabels=null; ExtractorAnnotator ann=null; NameMatcher nameMatcher=new NameMatcher(spanType); // parse and load arguments for(int i=0;i<args.length;i++){ if(args[i].equals("-loadFrom")){ fromFile=new File(args[i+1]); }else if(args[i].equals("-saveAs")){ saveAs=new File(args[i+1]); }else if(args[i].equals("-labels")){ textLabels=(MutableTextLabels)FancyLoader.loadTextLabels(args[i+1]); }else if(args[i].equals("-spanType")){ spanType=args[i+1]; } } if((fromFile==null)||(textLabels==null)||(spanType==null)) usage(); // load the annotator try{ ann=(ExtractorAnnotator)IOUtil.loadSerialized(fromFile); }catch(IOException ex){ throw new IllegalArgumentException("can't load annotator from "+fromFile+ ": "+ex); } annLabels=(MonotonicTextLabels)ann.annotatedCopy(textLabels); // TextBaseViewer.view(annLabels); nameMatcher.doAnnotate(annLabels); MixupProgram p=null; try{ p= new MixupProgram( new String[]{"defTokenProp email:t = ~re'([\\.\\-\\w+]+\\@[\\.\\-\\w\\+]+)',1;"}); p.addStatement("defSpanType email =: ... [email:t+R] ... ;"); p .addStatement("defTokenProp predicted_name:1 =: ... [@_prediction_updated] ... || ... [@_prediction] ... ;"); p .addStatement("defSpanType _prediction_updated_fixed =: ... [L <predicted_name:1, !email:t, !delete:t>+ R] ... ;"); }catch(Exception e){ System.out.println(e); } MixupInterpreter interp=new MixupInterpreter(p); interp.eval(postLabels); TextBaseViewer.view(postLabels); if(saveAs!=null){ try{ (new TextLabelsLoader()).saveTypesAsOps(postLabels,saveAs); }catch(IOException e){ try{ (new TextLabelsLoader()).saveTypesAsOps(postLabels,new File( "name-matching-labels.env")); }catch(Exception e2){ System.out.println(e2); } } } // TextBaseViewer.view(nameMatcher.postLabels); SpanDifference sd; System.out .println("============================================================"); System.out.println("Pre names-matching:"); sd= new SpanDifference(NameMatcher.postLabels .instanceIterator(nameMatcher.predType),NameMatcher.postLabels .instanceIterator(spanType),NameMatcher.postLabels .closureIterator(spanType)); System.out.println(sd.toSummary()); System.out.println("Post names-matching:"); SpanDifference finalSD= new SpanDifference(NameMatcher.postLabels .instanceIterator(nameMatcher.predType+"_updated_fixed"), NameMatcher.postLabels.instanceIterator(spanType), NameMatcher.postLabels.closureIterator(spanType)); System.out.println(finalSD.toSummary()); } }