package edu.cmu.minorthird.text; import java.io.BufferedReader; import java.io.File; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.io.Serializable; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.SortedSet; import java.util.TreeMap; import java.util.TreeSet; import org.apache.log4j.Logger; import edu.cmu.minorthird.text.gui.ZoomingTextLabelsViewer; import edu.cmu.minorthird.util.Saveable; import edu.cmu.minorthird.util.gui.Viewer; import edu.cmu.minorthird.util.gui.Visible; /** * Maintains assertions about 'types' and 'properties' of contiguous Spans of * these TextToken's. * * @author William Cohen */ public class BasicTextLabels implements MutableTextLabels,Serializable,Visible, Saveable{ static private final long serialVersionUID=20080303L; private static Logger log=Logger.getLogger(BasicTextLabels.class); private Map<Token,SortedMap<String,String>> textTokenPropertyMap=new HashMap<Token,SortedMap<String,String>>(); private Set<String> textTokenPropertySet=new HashSet<String>(); private Map<Span,SortedMap<String,String>> spanPropertyMap=new HashMap<Span,SortedMap<String,String>>(); private Map<String,SortedSet<Span>> spansWithSomePropertyByDocId=new HashMap<String,SortedSet<Span>>(); private Set<String> spanPropertySet=new HashSet<String>(); private Map<String,SortedMap<String,SortedSet<Span>>> typeDocumentSetMap=new TreeMap<String,SortedMap<String,SortedSet<Span>>>(); private Map<String,SortedMap<String,SortedSet<Span>>> closureDocumentSetMap=new HashMap<String,SortedMap<String,SortedSet<Span>>>(); private Map<String,Set<String>> textTokenDictMap=new HashMap<String,Set<String>>(); private Set<String> annotatedBySet=new HashSet<String>(); private Map<ObjectStringKey<?>,Details> detailMap=new TreeMap<ObjectStringKey<?>,Details>(); private AnnotatorLoader loader=new DefaultAnnotatorLoader(); // for statementType = TRIE public Trie trie=null; // don't serialize this, it's too big! transient private TextBase textBase=null; /** Creates an empty TextLabels not associated with a TextBase */ public BasicTextLabels(){ this.textBase=null; } /** Creates an empty TextLabels associated with the specified TextBase */ public BasicTextLabels(TextBase textBase){ this.textBase=textBase; } /** * Returns the TextBase associated with this labels set or NULL if it has not * been set. */ @Override public TextBase getTextBase(){ return textBase; } /** Returns whether this labels set knows about the specified dictionary */ @Override public boolean hasDictionary(String dictionary){ return textTokenDictMap.containsKey(dictionary); } /** * Sets the TextBase associated with this labels set. * * @throws java.lang.IllegalStateException * If the TextBase has already been set. */ @Override public void setTextBase(TextBase textBase){ if(this.textBase!=null) throw new IllegalStateException("textBase already set"); this.textBase=textBase; } /** A convenience method which creates empty labels containing a single string. */ public BasicTextLabels(String s){ this(new BasicTextBase()); ((BasicTextBase)getTextBase()).loadDocument("nullId",s); } // // methods used to maintain annotation history // /** * Returns whether or not this labels set has been annotated to include the * specified type. */ @Override public boolean isAnnotatedBy(String s){ return annotatedBySet.contains(s); } /** * Adds the specified type to the list of annotation types that this labels * set has been annotated to contain. */ @Override public void setAnnotatedBy(String s){ annotatedBySet.add(s); } /** Sets the loader used to locate annotators. */ @Override public void setAnnotatorLoader(AnnotatorLoader newLoader){ this.loader=newLoader; } /** Returns the current loader used to locate annotators. */ @Override public AnnotatorLoader getAnnotatorLoader(){ return loader; } @Override public void require(String annotationType,String fileToLoad){ require(annotationType,fileToLoad,loader); } @Override public void require(String annotationType,String fileToLoad,AnnotatorLoader theLoader){ doRequire(this,annotationType,fileToLoad,theLoader); } static public void doRequire(MonotonicTextLabels labels,String annotationType,String fileToLoad,AnnotatorLoader theLoader){ // only annotate if not already done if(annotationType!=null&&!labels.isAnnotatedBy(annotationType)){ if(theLoader==null){ // use current loader as default theLoader=labels.getAnnotatorLoader(); } log.info("Trying load \""+annotationType+"\" from "+fileToLoad+" using "+theLoader); Annotator annotator=theLoader.findAnnotator(annotationType,fileToLoad); log.info("Loaded "+annotator); if(annotator==null){ throw new IllegalArgumentException("Cannot find annotator "+annotationType+" (file: "+fileToLoad+")"); } // annotate using theLoader for any recursively-required annotations, AnnotatorLoader savedLoader=labels.getAnnotatorLoader(); labels.setAnnotatorLoader(theLoader); annotator.annotate(labels); labels.setAnnotatorLoader(savedLoader); // restore original loader // check that the annotationType is provided if(!labels.isAnnotatedBy(annotationType)){ throw new IllegalStateException(annotator+" did not provide annotation type: "+annotationType); } } } @Override public void annotateWith(String annotationType,String fileToLoad){ annotateWith(this,annotationType,fileToLoad); } static public void annotateWith(MonotonicTextLabels labels, String annotationType,String fileToLoad){ AnnotatorLoader theLoader=labels.getAnnotatorLoader(); Annotator annotator=theLoader.findAnnotator(annotationType,fileToLoad); annotator.annotate(labels); } // // maintain dictionaries // /** Returns true if the value of the Token is in the named dictionary. */ @Override public boolean inDict(Token token,String dictName){ if(token.getValue()==null) throw new IllegalArgumentException("null token.value?"); Set<String> set=textTokenDictMap.get(dictName); if(set==null) throw new IllegalArgumentException("undefined dictionary "+dictName); return set.contains(token.getValue()); } /** Associate a dictionary with this labeling. */ @Override public void defineDictionary(String dictName,Set<String> dictionary){ textTokenDictMap.put(dictName,dictionary); if(log.isDebugEnabled()) log.debug("added to token dictionary: "+dictName+" values "+textTokenDictMap.get(dictName)); } /** Associate a dictionary from this file */ @Override public void defineDictionary(String dictName,List<String> fileNames, boolean ignoreCase){ Set<String> wordSet=new HashSet<String>(); AnnotatorLoader theLoader=this.getAnnotatorLoader(); // We should use the same tokenizer that the text base associated with this // labels set uses for new docs. // RegexTokenizer tok = new RegexTokenizer(); Tokenizer tok=this.getTextBase().getTokenizer(); String[] currentEntryTokens; for(int i=0;i<fileNames.size();i++){ String fileName=fileNames.get(i); InputStream stream=theLoader.findFileResource(fileName); try{ LineNumberReader bReader= new LineNumberReader(new BufferedReader(new InputStreamReader( stream))); String s=null; while((s=bReader.readLine())!=null){ s=s.trim(); // remove trailing blanks // Split the entry into tokens and add it to the set only if there is // a single token. // Otherwise give an warning and ignore the entry. currentEntryTokens=tok.splitIntoTokens(s); if(currentEntryTokens.length>1){ log .warn("Ignoring entry: \'"+ s+ "\' because it contains more than 1 token. Use a Trie to match against sequences of tokens."); }else{ if(ignoreCase) s=s.toLowerCase(); wordSet.add(s); } } bReader.close(); }catch(IOException ioe){ // parseError("Error when reading " + fileName.toString() + ": " + ioe); ioe.printStackTrace(); } } defineDictionary(dictName,wordSet); } /** Return a trie if defined */ @Override public Trie getTrie(){ return trie; } /** Define a trie */ @Override public void defineTrie(List<String> phraseList){ trie=new Trie(); // We should use the same tokenizer that the text base associated with this // labels set uses for new docs. // RegexTokenizer tokenizer = new RegexTokenizer(); Tokenizer tokenizer=this.getTextBase().getTokenizer(); for(int i=0;i<phraseList.size();i++){ String[] toks=tokenizer.splitIntoTokens(phraseList.get(i)); if(toks.length<=2||!"\"".equals(toks[0])|| !"\"".equals(toks[toks.length-1])){ trie.addWords("phrase#"+i,toks); }else{ StringBuffer defFile=new StringBuffer(""); for(int j=1;j<toks.length-1;j++){ defFile.append(toks[j]); } AnnotatorLoader theLoader=this.getAnnotatorLoader(); InputStream stream=theLoader.findFileResource(defFile.toString()); try{ LineNumberReader bReader= new LineNumberReader(new BufferedReader(new InputStreamReader( stream))); String s=null; int line=0; while((s=bReader.readLine())!=null){ line++; String[] words=tokenizer.splitIntoTokens(s); trie.addWords(defFile+".line."+line,words); } bReader.close(); }catch(IOException ioe){ // parseError("Error when reading " + defFile.toString() + ": " + // ioe); ioe.printStackTrace(); } } // file load } // each phrase } // // maintain assertions about properties of Tokens // /** Get the property value associated with this Token. */ @Override public String getProperty(Token token,String prop){ return getPropMap(token).get(prop); } /** Get a set of all properties. */ @Override public Set<String> getTokenProperties(){ return textTokenPropertySet; } /** Assert that Token textToken has the given value of the given property */ @Override public void setProperty(Token textToken,String prop,String value){ getPropMap(textToken).put(prop,value); textTokenPropertySet.add(prop); } /** * Assert that Token textToken has the given value of the given property, and * associate that with some detailed information */ @Override public void setProperty(Token textToken,String prop,String value, Details details){ setProperty(textToken,prop,value); if(details!=null){ detailMap.put(new TokenPropKey(textToken,prop),details); } } private SortedMap<String,String> getPropMap(Token textToken){ SortedMap<String,String> map=textTokenPropertyMap.get(textToken); if(map==null){ map=new TreeMap<String,String>(); textTokenPropertyMap.put(textToken,map); } return map; } // // maintain assertions about properties of spans // /** Get the property value associated with this Span. */ @Override public String getProperty(Span span,String prop){ return getPropMap(span).get(prop); } /** Get a set of all properties. */ @Override public Set<String> getSpanProperties(){ return spanPropertySet; } /** Find all spans that have a non-null value for this property. */ @Override public Iterator<Span> getSpansWithProperty(String prop){ SortedSet<Span> accum=new TreeSet<Span>(); for(Iterator<Span> i=spanPropertyMap.keySet().iterator();i.hasNext();){ Span s=i.next(); if(getProperty(s,prop)!=null){ accum.add(s); } } return accum.iterator(); } /** Find all spans that have a non-null value for this property. */ @Override public Iterator<Span> getSpansWithProperty(String prop,String id){ SortedSet<Span> set=spansWithSomePropertyByDocId.get(id); if(set==null) return Collections.EMPTY_SET.iterator(); else{ SortedSet<Span> accum=new TreeSet<Span>(); for(Iterator<Span> i=set.iterator();i.hasNext();){ Span s=i.next(); if(getProperty(s,prop)!=null){ accum.add(s); } } return accum.iterator(); } } /** Assert that Span span has the given value of the given property */ @Override public void setProperty(Span span,String prop,String value){ getPropMap(span).put(prop,value); spanPropertySet.add(prop); SortedSet<Span> set=spansWithSomePropertyByDocId.get(span.getDocumentId()); if(set==null) spansWithSomePropertyByDocId .put(span.getDocumentId(),(set=new TreeSet<Span>())); set.add(span); } @Override public void setProperty(Span span,String prop,String value,Details details){ setProperty(span,prop,value); if(details!=null){ detailMap.put(new SpanPropKey(span,prop),details); } } private SortedMap<String,String> getPropMap(Span span){ SortedMap<String,String> map=spanPropertyMap.get(span); if(map==null){ map=new TreeMap<String,String>(); spanPropertyMap.put(span,map); } return map; } // // maintain assertions about types of Spans // @Override public boolean hasType(Span span,String type){ return getTypeSet(type,span.getDocumentId()).contains(span); } @Override public void addToType(Span span,String type){ if(type==null) throw new IllegalArgumentException("null type added"); lookupTypeSet(type,span.getDocumentId()).add(span); } @Override public void addToType(Span span,String type,Details details){ addToType(span,type); if(details!=null){ detailMap.put(new SpanTypeKey(span,type),details); } } @Override public Set<String> getTypes(){ return typeDocumentSetMap.keySet(); } @Override public boolean isType(String type){ return typeDocumentSetMap.get(type)!=null; } @Override public void declareType(String type){ // System.out.println("BasicTextLabels: declareType: "+type); if(type==null) throw new IllegalArgumentException("null type declared"); if(!isType(type)) typeDocumentSetMap.put(type,new TreeMap<String,SortedSet<Span>>()); } @Override public Iterator<Span> instanceIterator(String type){ return new MyNestedSpanLooper(type,false); } @Override public Iterator<Span> instanceIterator(String type,String documentId){ if(documentId!=null) return getTypeSet(type,documentId).iterator(); else return instanceIterator(type); } @Override public void defineTypeInside(String type,Span s,Iterator<Span> i){ if(type==null||s.getDocumentId()==null) throw new IllegalArgumentException("null type defined"); // System.out.println("BTE type: "+type+" documentId: "+s.getDocumentId()); Set<Span> set=lookupTypeSet(type,s.getDocumentId()); // remove all spans currently inside set for(Iterator<Span> j=set.iterator();j.hasNext();){ Span t=j.next(); if(s.contains(t)) j.remove(); } // add spans from i to set while(i.hasNext()) set.add(i.next()); // close the type closeTypeInside(type,s); } @Override public Details getDetails(Span span,String type){ SpanTypeKey key=new SpanTypeKey(span,type); Details details=detailMap.get(key); if(details!=null) return details; else return hasType(span,type)?Details.DEFAULT:null; } // get the set of spans with a given type in the given document // so that it can be modified protected Set<Span> lookupTypeSet(String type,String documentId){ if(type==null||documentId==null) throw new IllegalArgumentException("null type?"); SortedMap<String,SortedSet<Span>> documentsWithType=typeDocumentSetMap.get(type); if(documentsWithType==null){ typeDocumentSetMap.put(type,documentsWithType=new TreeMap<String,SortedSet<Span>>()); } // System.out.println("BTE type: "+type+" documentId: "+documentId+" // documentsWithType:" + documentsWithType); SortedSet<Span> set=documentsWithType.get(documentId); if(set==null){ documentsWithType.put(documentId,(set=new TreeSet<Span>())); } return set; } // get the set of spans with a given type in the given document w/o changing // it @Override public Set<Span> getTypeSet(String type,String documentId){ if(type==null||documentId==null) throw new IllegalArgumentException("null type?"); SortedMap<String,SortedSet<Span>> documentsWithType=typeDocumentSetMap.get(type); if(documentsWithType==null) return Collections.EMPTY_SET; SortedSet<Span> set=documentsWithType.get(documentId); if(set==null) return Collections.EMPTY_SET; return set; } private class ObjectStringKey<T extends Comparable<T>> implements Comparable<ObjectStringKey<T>>{ T obj; String str; public ObjectStringKey(T o,String s){ this.obj=o; this.str=s; } @Override public int compareTo(ObjectStringKey<T> b){ String bn=b.obj.getClass().toString(); int tmp=obj.getClass().toString().compareTo(bn); if(tmp!=0) return tmp; tmp=obj.compareTo(b.obj); if(tmp!=0) return tmp; return str.compareTo(b.str); } } private class SpanTypeKey extends ObjectStringKey<Span>{ public SpanTypeKey(Span span,String type){ super(span,"type:"+type); } } private class SpanPropKey extends ObjectStringKey<Span>{ public SpanPropKey(Span span,String prop){ super(span,"prop:"+prop); } } private class TokenPropKey extends ObjectStringKey<String>{ public TokenPropKey(Token token,String prop){ super(token.getValue(),prop); } } // // maintain assertions about where the closed world assumption holds // @Override public Iterator<Span> closureIterator(String type){ return new MyNestedSpanLooper(type,true); } @Override public Iterator<Span> closureIterator(String type,String documentId){ if(documentId!=null){ return getClosureSet(type,documentId).iterator(); } else{ return closureIterator(type); } } @Override public void closeTypeInside(String type,Span s){ getClosureSet(type,s.getDocumentId()).add(s); } /** * get the set of spans with a given type in the given document */ private Set<Span> getClosureSet(String type,String documentId){ SortedMap<String,SortedSet<Span>> documentsWithClosure=closureDocumentSetMap.get(type); if(documentsWithClosure==null){ closureDocumentSetMap.put(type,documentsWithClosure=new TreeMap<String,SortedSet<Span>>()); //closureDocumentSetMap.put(type,documentsWithClosure=typeDocumentSetMap.get(type)); } SortedSet<Span> set=documentsWithClosure.get(documentId); if(set==null){ documentsWithClosure.put(documentId,set=new TreeSet<Span>()); } return set; } /** iterate over all spans of a given type */ private class MyNestedSpanLooper implements Iterator<Span>{ private Iterator<Map.Entry<String,SortedSet<Span>>> documentIterator; private Iterator<Span> spanIterator; private Span nextSpan; // private int estimatedSize; // private boolean getClosures; // if false, get documents public MyNestedSpanLooper(String type,boolean getClosures){ // System.out.println("building MyNestedSpanLooper for "+type+": // "+typeDocumentSetMap); Map<String,SortedSet<Span>> documentMap=getClosures?closureDocumentSetMap.get(type):typeDocumentSetMap.get(type); if(documentMap==null){ nextSpan=null; // estimatedSize=0; }else{ // iterator over the documents in the map documentIterator=documentMap.entrySet().iterator(); // estimatedSize=documentMap.entrySet().size(); spanIterator=null; advance(); } } /** * @return Number of documents with the given type */ // public int estimatedSize(){ // return estimatedSize; // } @Override public boolean hasNext(){ return nextSpan!=null; } @Override public void remove(){ throw new UnsupportedOperationException("can't remove"); } @Override public Span next(){ Span result=nextSpan; advance(); return result; } // public Span nextSpan(){ // return (Span)next(); // } private void advance(){ if(spanIterator!=null&&spanIterator.hasNext()){ // get next span in the current document nextSpan=spanIterator.next(); }else if(documentIterator.hasNext()){ // move to the next document Map.Entry<String,SortedSet<Span>> entry=documentIterator.next(); spanIterator=entry.getValue().iterator(); advance(); }else{ // nothing found nextSpan=null; } } } @Override public String toString(){ return "[BasicTextLabels "+typeDocumentSetMap+"]"; } /** Dump of all strings that have textTokenuence with the given property */ @Override public String showTokenProp(TextBase base,String prop){ StringBuffer buf=new StringBuffer(); for(Iterator<Span> i=base.documentSpanIterator();i.hasNext();){ Span span=i.next(); for(int j=0;j<span.size();j++){ Token textToken=span.getToken(j); if(j>0) buf.append(" "); buf.append(textToken.getValue()); String val=getProperty(textToken,prop); if(val!=null){ buf.append(":"+val); } } buf.append("\n"); } return buf.toString(); } @Override public Viewer toGUI(){ return new ZoomingTextLabelsViewer(this); } // // Implement Saveable interface. // static private final String FORMAT_NAME="Minorthird TextLabels"; @Override public String[] getFormatNames(){ return new String[]{FORMAT_NAME}; } @Override public String getExtensionFor(String s){ return ".labels"; } @Override public void saveAs(File file,String format) throws IOException{ if(!format.equals(FORMAT_NAME)) throw new IllegalArgumentException("illegal format "+format); new TextLabelsLoader().saveTypesAsOps(this,file); } @Override public Object restore(File file) throws IOException{ throw new UnsupportedOperationException("Cannot load TextLabels object"); } }