package edu.cmu.minorthird.text;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.Serializable;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.SortedMap;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import org.apache.log4j.Logger;
import edu.cmu.minorthird.text.gui.ZoomingTextLabelsViewer;
import edu.cmu.minorthird.util.Saveable;
import edu.cmu.minorthird.util.gui.Viewer;
import edu.cmu.minorthird.util.gui.Visible;
/**
* Maintains assertions about the 'types' and 'properties' of contiguous Spans
* of tokens, and about the properties of individual Tokens.
*
* @author William Cohen
*/
public class BasicTextLabels implements MutableTextLabels,Serializable,Visible,
Saveable{
/** Version stamp used by Java serialization. */
static private final long serialVersionUID=20080303L;
/** Logger shared by all instances of this class. */
private static Logger log=Logger.getLogger(BasicTextLabels.class);
// token -> (property name -> value)
private Map<Token,SortedMap<String,String>> textTokenPropertyMap=new HashMap<Token,SortedMap<String,String>>();
// names of all token properties that have ever been set
private Set<String> textTokenPropertySet=new HashSet<String>();
// span -> (property name -> value)
private Map<Span,SortedMap<String,String>> spanPropertyMap=new HashMap<Span,SortedMap<String,String>>();
// document id -> spans in that document that have had some property set
private Map<String,SortedSet<Span>> spansWithSomePropertyByDocId=new HashMap<String,SortedSet<Span>>();
// names of all span properties that have ever been set
private Set<String> spanPropertySet=new HashSet<String>();
// type name -> (document id -> spans of that type in the document)
private Map<String,SortedMap<String,SortedSet<Span>>> typeDocumentSetMap=new TreeMap<String,SortedMap<String,SortedSet<Span>>>();
// type name -> (document id -> spans inside which the type has been closed)
private Map<String,SortedMap<String,SortedSet<Span>>> closureDocumentSetMap=new HashMap<String,SortedMap<String,SortedSet<Span>>>();
// dictionary name -> set of strings in that dictionary
private Map<String,Set<String>> textTokenDictMap=new HashMap<String,Set<String>>();
// annotation types that have been recorded via setAnnotatedBy
private Set<String> annotatedBySet=new HashSet<String>();
// optional Details attached to span-type, span-property and token-property assertions
private Map<ObjectStringKey<?>,Details> detailMap=new TreeMap<ObjectStringKey<?>,Details>();
// loader used by default to locate annotators and file resources
private AnnotatorLoader loader=new DefaultAnnotatorLoader();
// for statementType = TRIE
// (assigned by defineTrie and returned by getTrie; null until a trie is defined)
public Trie trie=null;
// don't serialize this, it's too big!
// (being transient, this field is not written out with the rest of the labels)
transient private TextBase textBase=null;
/** Creates an empty TextLabels that has no TextBase attached to it. */
public BasicTextLabels(){
  this((TextBase)null);
}
/**
 * Creates an empty TextLabels bound to the given TextBase.
 *
 * @param textBase the text base these labels refer to (may be null)
 */
public BasicTextLabels(TextBase textBase){
  super();
  this.textBase=textBase;
}
/**
 * Gives the TextBase these labels are attached to.
 *
 * @return the associated TextBase, or null when none has been set
 */
@Override
public TextBase getTextBase(){
  return this.textBase;
}
/**
 * Tells whether a dictionary has been defined under the given name.
 *
 * @param dictionary the dictionary name to look up
 * @return true if a dictionary is registered under that name
 */
@Override
public boolean hasDictionary(String dictionary){
  final boolean known=textTokenDictMap.containsKey(dictionary);
  return known;
}
/**
 * Attaches a TextBase to this labels set. This may be done only while no
 * TextBase is attached.
 *
 * @param textBase the text base to associate with these labels
 * @throws java.lang.IllegalStateException
 *           if a TextBase has already been set
 */
@Override
public void setTextBase(TextBase textBase){
  if(this.textBase==null){
    this.textBase=textBase;
    return;
  }
  throw new IllegalStateException("textBase already set");
}
/** A convenience method which creates empty labels containing a single string. */
public BasicTextLabels(String s){
this(new BasicTextBase());
((BasicTextBase)getTextBase()).loadDocument("nullId",s);
}
//
// methods used to maintain annotation history
//
/**
 * Tells whether the given annotation type has been recorded as provided for
 * this labels set.
 *
 * @param s the annotation type
 * @return true if {@link #setAnnotatedBy(String)} was called with this type
 */
@Override
public boolean isAnnotatedBy(String s){
  final boolean alreadyAnnotated=annotatedBySet.contains(s);
  return alreadyAnnotated;
}
/**
 * Records that this labels set now contains the given annotation type.
 *
 * @param s the annotation type to record
 */
@Override
public void setAnnotatedBy(String s){
  this.annotatedBySet.add(s);
}
/**
 * Replaces the loader that is used by default to locate annotators.
 *
 * @param newLoader the loader to use from now on
 */
@Override
public void setAnnotatorLoader(AnnotatorLoader newLoader){
  loader=newLoader;
}
/**
 * Gives the loader currently used by default to locate annotators.
 *
 * @return the current annotator loader
 */
@Override
public AnnotatorLoader getAnnotatorLoader(){
  return this.loader;
}
/**
 * Requires an annotation type, locating the annotator with this labels set's
 * current loader.
 *
 * @param annotationType the annotation type that must be present
 * @param fileToLoad passed through to the loader when looking up the annotator
 */
@Override
public void require(String annotationType,String fileToLoad){
  this.require(annotationType,fileToLoad,this.loader);
}
/**
 * Requires an annotation type, locating the annotator with the given loader.
 * Delegates to {@link #doRequire}.
 *
 * @param annotationType the annotation type that must be present
 * @param fileToLoad passed through to the loader when looking up the annotator
 * @param theLoader the loader to use
 */
@Override
public void require(String annotationType,String fileToLoad,AnnotatorLoader theLoader){
  BasicTextLabels.doRequire(this,annotationType,fileToLoad,theLoader);
}
/**
 * Makes sure that <code>labels</code> carries the given annotation type,
 * running an annotator found through a loader if it does not.
 * Nothing happens when <code>annotationType</code> is null or is already
 * recorded as present on <code>labels</code>.
 *
 * @param labels the labels to annotate
 * @param annotationType the annotation type that must be present afterwards
 * @param fileToLoad passed to the loader when looking up the annotator
 * @param theLoader loader used to find the annotator; when null, the loader
 *          currently installed on <code>labels</code> is used
 * @throws IllegalArgumentException if the loader finds no annotator
 * @throws IllegalStateException if the annotator ran but did not mark
 *           <code>annotationType</code> as provided
 */
static public void doRequire(MonotonicTextLabels labels,String annotationType,String fileToLoad,AnnotatorLoader theLoader){
  // only annotate if not already done
  if(annotationType==null||labels.isAnnotatedBy(annotationType)){
    return;
  }
  if(theLoader==null){
    // use current loader as default
    theLoader=labels.getAnnotatorLoader();
  }
  log.info("Trying load \""+annotationType+"\" from "+fileToLoad+" using "+theLoader);
  Annotator annotator=theLoader.findAnnotator(annotationType,fileToLoad);
  if(annotator==null){
    throw new IllegalArgumentException("Cannot find annotator "+annotationType+" (file: "+fileToLoad+")");
  }
  log.info("Loaded "+annotator);
  // annotate using theLoader for any recursively-required annotations
  AnnotatorLoader savedLoader=labels.getAnnotatorLoader();
  labels.setAnnotatorLoader(theLoader);
  try{
    annotator.annotate(labels);
  }finally{
    // restore the original loader even when the annotator throws, so a
    // failed annotation does not leave the temporary loader installed
    labels.setAnnotatorLoader(savedLoader);
  }
  // check that the annotationType is provided
  if(!labels.isAnnotatedBy(annotationType)){
    throw new IllegalStateException(annotator+" did not provide annotation type: "+annotationType);
  }
}
/**
 * Runs the named annotator over this labels set.
 *
 * @param annotationType the annotation type used to look up the annotator
 * @param fileToLoad passed through to the loader when looking up the annotator
 */
@Override
public void annotateWith(String annotationType,String fileToLoad){
  BasicTextLabels.annotateWith(this,annotationType,fileToLoad);
}
/**
 * Looks up an annotator through the loader installed on <code>labels</code>
 * and runs it over <code>labels</code>. Unlike {@link #doRequire}, this does
 * not check whether the annotation type is already present.
 *
 * @param labels the labels to annotate
 * @param annotationType the annotation type used to look up the annotator
 * @param fileToLoad passed to the loader when looking up the annotator
 * @throws IllegalArgumentException if the loader finds no annotator
 */
static public void annotateWith(MonotonicTextLabels labels,
    String annotationType,String fileToLoad){
  AnnotatorLoader theLoader=labels.getAnnotatorLoader();
  Annotator annotator=theLoader.findAnnotator(annotationType,fileToLoad);
  if(annotator==null){
    // fail with a descriptive message instead of a bare NullPointerException
    throw new IllegalArgumentException("Cannot find annotator "+annotationType+" (file: "+fileToLoad+")");
  }
  annotator.annotate(labels);
}
//
// maintain dictionaries
//
/**
 * Tests whether the value of a token is a member of the named dictionary.
 *
 * @param token the token whose value is looked up
 * @param dictName the name of a previously defined dictionary
 * @return true if the dictionary contains the token's value
 * @throws IllegalArgumentException if the token's value is null or no
 *           dictionary is defined under <code>dictName</code>
 */
@Override
public boolean inDict(Token token,String dictName){
  if(token.getValue()==null){
    throw new IllegalArgumentException("null token.value?");
  }
  Set<String> dictionary=textTokenDictMap.get(dictName);
  if(dictionary!=null){
    return dictionary.contains(token.getValue());
  }
  throw new IllegalArgumentException("undefined dictionary "+dictName);
}
/**
 * Registers a dictionary under the given name, replacing any dictionary
 * previously registered under that name.
 *
 * @param dictName the name to register the dictionary under
 * @param dictionary the strings that make up the dictionary
 */
@Override
public void defineDictionary(String dictName,Set<String> dictionary){
  textTokenDictMap.put(dictName,dictionary);
  if(!log.isDebugEnabled()){
    return;
  }
  log.debug("added to token dictionary: "+dictName+" values "+textTokenDictMap.get(dictName));
}
/**
 * Builds a dictionary from the lines of one or more files and registers it
 * under the given name. Each file is located through the current annotator
 * loader and read line by line. A trimmed line is added to the dictionary
 * unless the text base's tokenizer splits it into more than one token, in
 * which case it is skipped with a warning. A file that cannot be read
 * completely is reported through the logger and the remaining files are
 * still processed.
 *
 * The tokenizer is taken from the associated text base, which is
 * dereferenced without a null check.
 *
 * @param dictName the name to register the dictionary under
 * @param fileNames the resources to read entries from
 * @param ignoreCase when true, entries are lower-cased before being stored
 */
@Override
public void defineDictionary(String dictName,List<String> fileNames,
    boolean ignoreCase){
  Set<String> wordSet=new HashSet<String>();
  AnnotatorLoader theLoader=this.getAnnotatorLoader();
  // Use the same tokenizer that the text base associated with this labels
  // set uses for new docs.
  Tokenizer tok=this.getTextBase().getTokenizer();
  for(String fileName:fileNames){
    InputStream stream=theLoader.findFileResource(fileName);
    LineNumberReader bReader=
        new LineNumberReader(new BufferedReader(new InputStreamReader(stream)));
    try{
      String s;
      while((s=bReader.readLine())!=null){
        s=s.trim(); // remove leading and trailing blanks
        // Only single-token entries can be matched by inDict, so entries
        // with more than one token are reported and ignored.
        String[] currentEntryTokens=tok.splitIntoTokens(s);
        if(currentEntryTokens.length>1){
          log.warn("Ignoring entry: \'"+s
              +"\' because it contains more than 1 token. Use a Trie to match against sequences of tokens.");
        }else{
          if(ignoreCase){
            s=s.toLowerCase();
          }
          wordSet.add(s);
        }
      }
    }catch(IOException ioe){
      log.error("Error when reading dictionary file "+fileName,ioe);
    }finally{
      // always release the underlying stream, including after a read failure
      try{
        bReader.close();
      }catch(IOException closeFailure){
        log.warn("Could not close dictionary file "+fileName,closeFailure);
      }
    }
  }
  defineDictionary(dictName,wordSet);
}
/**
 * Gives the trie defined for these labels.
 *
 * @return the trie, or null if none has been defined
 */
@Override
public Trie getTrie(){
  return this.trie;
}
/**
 * Replaces the trie of these labels with a new one built from a phrase list.
 * Each phrase is tokenized with the text base's tokenizer. A phrase whose
 * first and last tokens are both a double quote, with at least one token in
 * between, names a file: the tokens between the quotes are concatenated to
 * form the file name, the file is located through the current annotator
 * loader, and every line of it is tokenized and added under the id
 * <code>&lt;file&gt;.line.&lt;n&gt;</code>. Any other phrase is added
 * directly under the id <code>phrase#&lt;index&gt;</code>. A file that
 * cannot be read completely is reported through the logger and the
 * remaining phrases are still processed.
 *
 * The tokenizer is taken from the associated text base, which is
 * dereferenced without a null check.
 *
 * @param phraseList the phrases, or quoted file names, to load
 */
@Override
public void defineTrie(List<String> phraseList){
  trie=new Trie();
  // Use the same tokenizer that the text base associated with this labels
  // set uses for new docs.
  Tokenizer tokenizer=this.getTextBase().getTokenizer();
  for(int i=0;i<phraseList.size();i++){
    String[] toks=tokenizer.splitIntoTokens(phraseList.get(i));
    boolean quotedFileName=
        toks.length>2&&"\"".equals(toks[0])&&"\"".equals(toks[toks.length-1]);
    if(!quotedFileName){
      trie.addWords("phrase#"+i,toks);
      continue;
    }
    // rebuild the file name from the tokens between the quotes
    StringBuilder nameBuf=new StringBuilder();
    for(int j=1;j<toks.length-1;j++){
      nameBuf.append(toks[j]);
    }
    String defFile=nameBuf.toString();
    AnnotatorLoader theLoader=this.getAnnotatorLoader();
    InputStream stream=theLoader.findFileResource(defFile);
    LineNumberReader bReader=
        new LineNumberReader(new BufferedReader(new InputStreamReader(stream)));
    try{
      String s;
      int line=0;
      while((s=bReader.readLine())!=null){
        line++;
        String[] words=tokenizer.splitIntoTokens(s);
        trie.addWords(defFile+".line."+line,words);
      }
    }catch(IOException ioe){
      log.error("Error when reading "+defFile,ioe);
    }finally{
      // always release the underlying stream, including after a read failure
      try{
        bReader.close();
      }catch(IOException closeFailure){
        log.warn("Could not close "+defFile,closeFailure);
      }
    }
  }
}
//
// maintain assertions about properties of Tokens
//
/**
 * Gets the value of a property of a token.
 *
 * The lookup does not modify the property store: a token that has never had
 * a property set does not get an empty property map allocated for it.
 *
 * @param token the token to look up
 * @param prop the property name
 * @return the value, or null if the token has no value for this property
 */
@Override
public String getProperty(Token token,String prop){
  SortedMap<String,String> props=textTokenPropertyMap.get(token);
  return props==null?null:props.get(prop);
}
/**
 * Gives the names of all token properties that have been set.
 *
 * @return the internal set of token property names (not a copy)
 */
@Override
public Set<String> getTokenProperties(){
  return this.textTokenPropertySet;
}
/**
 * Asserts that a token has the given value for the given property, and
 * records the property name among the known token properties.
 *
 * @param textToken the token to annotate
 * @param prop the property name
 * @param value the property value
 */
@Override
public void setProperty(Token textToken,String prop,String value){
  SortedMap<String,String> props=getPropMap(textToken);
  props.put(prop,value);
  textTokenPropertySet.add(prop);
}
/**
 * Asserts that a token has the given value for the given property and, when
 * details are supplied, stores them alongside the assertion.
 *
 * @param textToken the token to annotate
 * @param prop the property name
 * @param value the property value
 * @param details extra information about the assertion; ignored when null
 */
@Override
public void setProperty(Token textToken,String prop,String value,
    Details details){
  setProperty(textToken,prop,value);
  if(details==null){
    return;
  }
  detailMap.put(new TokenPropKey(textToken,prop),details);
}
/**
 * Gives the modifiable property map of a token, creating and registering an
 * empty one the first time the token is seen.
 */
private SortedMap<String,String> getPropMap(Token textToken){
  SortedMap<String,String> props=textTokenPropertyMap.get(textToken);
  if(props!=null){
    return props;
  }
  props=new TreeMap<String,String>();
  textTokenPropertyMap.put(textToken,props);
  return props;
}
//
// maintain assertions about properties of spans
//
/**
 * Gets the value of a property of a span.
 *
 * The lookup does not modify the property store: a span that has never had
 * a property set does not get an empty property map allocated for it.
 *
 * @param span the span to look up
 * @param prop the property name
 * @return the value, or null if the span has no value for this property
 */
@Override
public String getProperty(Span span,String prop){
  SortedMap<String,String> props=spanPropertyMap.get(span);
  return props==null?null:props.get(prop);
}
/**
 * Gives the names of all span properties that have been set.
 *
 * @return the internal set of span property names (not a copy)
 */
@Override
public Set<String> getSpanProperties(){
  return this.spanPropertySet;
}
/**
 * Finds, across all documents, the spans that have a non-null value for a
 * property.
 *
 * @param prop the property name
 * @return an iterator over a sorted snapshot of the matching spans
 */
@Override
public Iterator<Span> getSpansWithProperty(String prop){
  SortedSet<Span> matches=new TreeSet<Span>();
  for(Span candidate:spanPropertyMap.keySet()){
    if(getProperty(candidate,prop)==null){
      continue;
    }
    matches.add(candidate);
  }
  return matches.iterator();
}
/**
 * Finds the spans of one document that have a non-null value for a property.
 *
 * @param prop the property name
 * @param id the document id
 * @return an iterator over a sorted snapshot of the matching spans; empty
 *         when no span of that document has had any property set
 */
@Override
public Iterator<Span> getSpansWithProperty(String prop,String id){
  SortedSet<Span> set=spansWithSomePropertyByDocId.get(id);
  if(set==null){
    // typed empty set instead of the raw Collections.EMPTY_SET
    return Collections.<Span>emptySet().iterator();
  }
  SortedSet<Span> accum=new TreeSet<Span>();
  for(Span s:set){
    if(getProperty(s,prop)!=null){
      accum.add(s);
    }
  }
  return accum.iterator();
}
/**
 * Asserts that a span has the given value for the given property. The
 * property name is recorded among the known span properties and the span is
 * indexed under its document id.
 *
 * @param span the span to annotate
 * @param prop the property name
 * @param value the property value
 */
@Override
public void setProperty(Span span,String prop,String value){
  getPropMap(span).put(prop,value);
  spanPropertySet.add(prop);
  SortedSet<Span> docSpans=spansWithSomePropertyByDocId.get(span.getDocumentId());
  if(docSpans==null){
    docSpans=new TreeSet<Span>();
    spansWithSomePropertyByDocId.put(span.getDocumentId(),docSpans);
  }
  docSpans.add(span);
}
/**
 * Asserts that a span has the given value for the given property and, when
 * details are supplied, stores them alongside the assertion.
 *
 * @param span the span to annotate
 * @param prop the property name
 * @param value the property value
 * @param details extra information about the assertion; ignored when null
 */
@Override
public void setProperty(Span span,String prop,String value,Details details){
  setProperty(span,prop,value);
  if(details==null){
    return;
  }
  detailMap.put(new SpanPropKey(span,prop),details);
}
/**
 * Gives the modifiable property map of a span, creating and registering an
 * empty one the first time the span is seen.
 */
private SortedMap<String,String> getPropMap(Span span){
  SortedMap<String,String> props=spanPropertyMap.get(span);
  if(props!=null){
    return props;
  }
  props=new TreeMap<String,String>();
  spanPropertyMap.put(span,props);
  return props;
}
//
// maintain assertions about types of Spans
//
/**
 * Tells whether a span has been asserted to have the given type.
 *
 * @param span the span to test
 * @param type the type name
 * @return true if the span is among the spans of that type in its document
 */
@Override
public boolean hasType(Span span,String type){
  Set<Span> spansOfType=getTypeSet(type,span.getDocumentId());
  return spansOfType.contains(span);
}
/**
 * Asserts that a span has the given type.
 *
 * @param span the span to add
 * @param type the type name
 * @throws IllegalArgumentException if <code>type</code> is null
 */
@Override
public void addToType(Span span,String type){
  if(type!=null){
    lookupTypeSet(type,span.getDocumentId()).add(span);
    return;
  }
  throw new IllegalArgumentException("null type added");
}
/**
 * Asserts that a span has the given type and, when details are supplied,
 * stores them alongside the assertion.
 *
 * @param span the span to add
 * @param type the type name
 * @param details extra information about the assertion; ignored when null
 */
@Override
public void addToType(Span span,String type,Details details){
  addToType(span,type);
  if(details==null){
    return;
  }
  detailMap.put(new SpanTypeKey(span,type),details);
}
/**
 * Gives the names of all types known to these labels.
 *
 * @return the key set of the internal type map (a view, not a copy)
 */
@Override
public Set<String> getTypes(){
  Set<String> typeNames=typeDocumentSetMap.keySet();
  return typeNames;
}
/**
 * Tells whether a type name is known to these labels.
 *
 * @param type the type name
 * @return true if the type has been declared or has had spans added to it
 */
@Override
public boolean isType(String type){
  SortedMap<String,SortedSet<Span>> byDocument=typeDocumentSetMap.get(type);
  return byDocument!=null;
}
/**
 * Makes a type name known to these labels without adding any spans to it.
 * Declaring an already known type has no effect.
 *
 * @param type the type name
 * @throws IllegalArgumentException if <code>type</code> is null
 */
@Override
public void declareType(String type){
  if(type==null){
    throw new IllegalArgumentException("null type declared");
  }
  if(isType(type)){
    return;
  }
  typeDocumentSetMap.put(type,new TreeMap<String,SortedSet<Span>>());
}
/**
 * Iterates over the spans of a type across all documents.
 *
 * @param type the type name
 * @return an iterator over the spans of that type
 */
@Override
public Iterator<Span> instanceIterator(String type){
  Iterator<Span> looper=new MyNestedSpanLooper(type,false);
  return looper;
}
/**
 * Iterates over the spans of a type within one document.
 *
 * @param type the type name
 * @param documentId the document to restrict to; when null, the spans of
 *          all documents are iterated
 * @return an iterator over the matching spans
 */
@Override
public Iterator<Span> instanceIterator(String type,String documentId){
  if(documentId==null){
    return instanceIterator(type);
  }
  return getTypeSet(type,documentId).iterator();
}
/**
 * Redefines which spans have a type inside a given span: spans of the type
 * that are contained in <code>s</code> are removed, the spans produced by
 * <code>i</code> are added, and the type is then closed inside
 * <code>s</code>.
 *
 * @param type the type name
 * @param s the span within which the type is being defined
 * @param i the spans that are to have the type
 * @throws IllegalArgumentException if <code>type</code> or the document id
 *           of <code>s</code> is null
 */
@Override
public void defineTypeInside(String type,Span s,Iterator<Span> i){
  if(type==null||s.getDocumentId()==null){
    throw new IllegalArgumentException("null type defined");
  }
  Set<Span> typed=lookupTypeSet(type,s.getDocumentId());
  // drop every span of this type that currently lies inside s
  Iterator<Span> existing=typed.iterator();
  while(existing.hasNext()){
    Span current=existing.next();
    if(s.contains(current)){
      existing.remove();
    }
  }
  // take over the spans supplied by the caller
  for(;i.hasNext();){
    typed.add(i.next());
  }
  // close the type
  closeTypeInside(type,s);
}
/**
 * Gets the details stored with the assertion that a span has a type.
 *
 * @param span the span
 * @param type the type name
 * @return the stored details; Details.DEFAULT when the span has the type
 *         but no details were stored; null when the span lacks the type
 */
@Override
public Details getDetails(Span span,String type){
  Details stored=detailMap.get(new SpanTypeKey(span,type));
  if(stored==null){
    if(hasType(span,type)){
      return Details.DEFAULT;
    }
    return null;
  }
  return stored;
}
/**
 * Gives the modifiable set of spans that have a type in a document,
 * creating the type entry and the per-document set when they do not exist.
 *
 * @param type the type name
 * @param documentId the document id
 * @return the internal set of spans, which callers may modify
 * @throws IllegalArgumentException if either argument is null
 */
protected Set<Span> lookupTypeSet(String type,String documentId){
  if(type==null||documentId==null){
    throw new IllegalArgumentException("null type?");
  }
  SortedMap<String,SortedSet<Span>> byDocument=typeDocumentSetMap.get(type);
  if(byDocument==null){
    byDocument=new TreeMap<String,SortedSet<Span>>();
    typeDocumentSetMap.put(type,byDocument);
  }
  SortedSet<Span> spans=byDocument.get(documentId);
  if(spans==null){
    spans=new TreeSet<Span>();
    byDocument.put(documentId,spans);
  }
  return spans;
}
/**
 * Gives the set of spans that have a type in a document, without creating
 * any entries when the type or the document is unknown.
 *
 * @param type the type name
 * @param documentId the document id
 * @return the internal set of spans, or an immutable empty set when there
 *         is none
 * @throws IllegalArgumentException if either argument is null
 */
@Override
public Set<Span> getTypeSet(String type,String documentId){
  if(type==null||documentId==null)
    throw new IllegalArgumentException("null type?");
  SortedMap<String,SortedSet<Span>> documentsWithType=typeDocumentSetMap.get(type);
  if(documentsWithType==null){
    // typed empty set instead of the raw Collections.EMPTY_SET
    return Collections.<Span>emptySet();
  }
  SortedSet<Span> set=documentsWithType.get(documentId);
  if(set==null){
    return Collections.<Span>emptySet();
  }
  return set;
}
private class ObjectStringKey<T extends Comparable<T>> implements Comparable<ObjectStringKey<T>>{
T obj;
String str;
public ObjectStringKey(T o,String s){
this.obj=o;
this.str=s;
}
@Override
public int compareTo(ObjectStringKey<T> b){
String bn=b.obj.getClass().toString();
int tmp=obj.getClass().toString().compareTo(bn);
if(tmp!=0)
return tmp;
tmp=obj.compareTo(b.obj);
if(tmp!=0)
return tmp;
return str.compareTo(b.str);
}
}
/** Key for the assertion that a span has a type; the string part is "type:" followed by the type name. */
private class SpanTypeKey extends ObjectStringKey<Span>{
  public SpanTypeKey(Span keyedSpan,String typeName){
    super(keyedSpan,"type:"+typeName);
  }
}
/** Key for a property assertion on a span; the string part is "prop:" followed by the property name. */
private class SpanPropKey extends ObjectStringKey<Span>{
  public SpanPropKey(Span keyedSpan,String propName){
    super(keyedSpan,"prop:"+propName);
  }
}
/**
 * Key for a property assertion on a token. The key wraps the token's string
 * value rather than the token itself, and uses the bare property name as
 * its string part.
 */
private class TokenPropKey extends ObjectStringKey<String>{
  public TokenPropKey(Token keyedToken,String propName){
    super(keyedToken.getValue(),propName);
  }
}
//
// maintain assertions about where the closed world assumption holds
//
/**
 * Iterates over the spans inside which a type has been closed, across all
 * documents.
 *
 * @param type the type name
 * @return an iterator over the closure spans of that type
 */
@Override
public Iterator<Span> closureIterator(String type){
  Iterator<Span> looper=new MyNestedSpanLooper(type,true);
  return looper;
}
/**
 * Iterates over the spans inside which a type has been closed within one
 * document.
 *
 * @param type the type name
 * @param documentId the document to restrict to; when null, the closure
 *          spans of all documents are iterated
 * @return an iterator over the matching closure spans
 */
@Override
public Iterator<Span> closureIterator(String type,String documentId){
  if(documentId==null){
    return closureIterator(type);
  }
  return getClosureSet(type,documentId).iterator();
}
/**
 * Records that a type is closed inside a span by adding the span to the
 * closure set of its document.
 *
 * @param type the type name
 * @param s the span inside which the type is closed
 */
@Override
public void closeTypeInside(String type,Span s){
  Set<Span> closedSpans=getClosureSet(type,s.getDocumentId());
  closedSpans.add(s);
}
/**
 * Gives the modifiable set of spans inside which a type has been closed in
 * a document, creating the type entry and the per-document set when they do
 * not exist.
 */
private Set<Span> getClosureSet(String type,String documentId){
  SortedMap<String,SortedSet<Span>> byDocument=closureDocumentSetMap.get(type);
  if(byDocument==null){
    byDocument=new TreeMap<String,SortedSet<Span>>();
    closureDocumentSetMap.put(type,byDocument);
  }
  SortedSet<Span> closed=byDocument.get(documentId);
  if(closed==null){
    closed=new TreeSet<Span>();
    byDocument.put(documentId,closed);
  }
  return closed;
}
/**
 * Iterates over all spans recorded for a type, document by document, reading
 * either the type map or the closure map of the enclosing labels. The next
 * span is computed ahead of time so that hasNext() is a simple null test.
 */
private class MyNestedSpanLooper implements Iterator<Span>{
  // iterator over the per-document span sets; null when the type is unknown
  private Iterator<Map.Entry<String,SortedSet<Span>>> documentIterator;
  // iterator over the spans of the current document; null before the first document
  private Iterator<Span> spanIterator;
  // span to be returned by the next call to next(); null when exhausted
  private Span nextSpan;
  /**
   * @param type the type name
   * @param getClosures when true iterate over closure spans, otherwise over
   *          the spans that have the type
   */
  public MyNestedSpanLooper(String type,boolean getClosures){
    Map<String,SortedSet<Span>> documentMap=getClosures?closureDocumentSetMap.get(type):typeDocumentSetMap.get(type);
    if(documentMap==null){
      documentIterator=null;
      spanIterator=null;
      nextSpan=null;
    }else{
      // iterator over the documents in the map
      documentIterator=documentMap.entrySet().iterator();
      spanIterator=null;
      advance();
    }
  }
  @Override
  public boolean hasNext(){
    return nextSpan!=null;
  }
  /** Not supported. */
  @Override
  public void remove(){
    throw new UnsupportedOperationException("can't remove");
  }
  /**
   * @throws NoSuchElementException if there are no more spans
   */
  @Override
  public Span next(){
    if(nextSpan==null){
      throw new NoSuchElementException("no more spans");
    }
    Span result=nextSpan;
    advance();
    return result;
  }
  // Moves nextSpan to the following span, skipping documents whose span set
  // is empty. Written as a loop so that a long run of empty documents does
  // not deepen the call stack.
  private void advance(){
    while(spanIterator==null||!spanIterator.hasNext()){
      if(documentIterator==null||!documentIterator.hasNext()){
        // nothing found
        nextSpan=null;
        return;
      }
      // move to the next document
      spanIterator=documentIterator.next().getValue().iterator();
    }
    // get next span in the current document
    nextSpan=spanIterator.next();
  }
}
/**
 * Describes these labels by the contents of the type map.
 *
 * @return "[BasicTextLabels " followed by the type map and "]"
 */
@Override
public String toString(){
  StringBuilder text=new StringBuilder("[BasicTextLabels ");
  text.append(typeDocumentSetMap);
  text.append("]");
  return text.toString();
}
/**
 * Renders every document span of a text base as one line of space-separated
 * token values; a token that has a value for the given property is followed
 * by a colon and that value.
 *
 * @param base the text base whose documents are rendered
 * @param prop the token property to show
 * @return the rendered text, one line per document span
 */
@Override
public String showTokenProp(TextBase base,String prop){
  StringBuilder out=new StringBuilder();
  Iterator<Span> documents=base.documentSpanIterator();
  while(documents.hasNext()){
    Span document=documents.next();
    for(int index=0;index<document.size();index++){
      Token current=document.getToken(index);
      if(index>0){
        out.append(" ");
      }
      out.append(current.getValue());
      String propValue=getProperty(current,prop);
      if(propValue!=null){
        out.append(":").append(propValue);
      }
    }
    out.append("\n");
  }
  return out.toString();
}
/**
 * Creates a viewer for these labels.
 *
 * @return a new ZoomingTextLabelsViewer constructed over this object
 */
@Override
public Viewer toGUI(){
  Viewer viewer=new ZoomingTextLabelsViewer(this);
  return viewer;
}
//
// Implement Saveable interface.
//
/** Name of the only save format; reported by getFormatNames and required by saveAs. */
static private final String FORMAT_NAME="Minorthird TextLabels";
/**
 * Lists the formats these labels can be saved in.
 *
 * @return a new one-element array holding the supported format name
 */
@Override
public String[] getFormatNames(){
  String[] names={FORMAT_NAME};
  return names;
}
/**
 * Gives the file extension used for saved labels.
 *
 * @param s the format name (not consulted)
 * @return always ".labels"
 */
@Override
public String getExtensionFor(String s){
  final String extension=".labels";
  return extension;
}
/**
 * Saves these labels to a file through TextLabelsLoader.saveTypesAsOps.
 *
 * @param file the file to write
 * @param format the format name; must be the single supported format
 * @throws IllegalArgumentException if <code>format</code> is null or is not
 *           the supported format
 * @throws IOException declared for failures while writing the file
 */
@Override
public void saveAs(File file,String format) throws IOException{
  // constant-first comparison so that a null format is rejected with the
  // same exception as any other unsupported format
  if(!FORMAT_NAME.equals(format))
    throw new IllegalArgumentException("illegal format "+format);
  new TextLabelsLoader().saveTypesAsOps(this,file);
}
/**
 * Restoring labels from a file is not supported by this class.
 *
 * @param file ignored
 * @return never returns normally
 * @throws UnsupportedOperationException always
 */
@Override
public Object restore(File file) throws IOException{
  final String reason="Cannot load TextLabels object";
  throw new UnsupportedOperationException(reason);
}
}