package edu.cmu.minorthird.text; import java.io.Serializable; import org.apache.log4j.Logger; import edu.cmu.minorthird.text.gui.SpanViewer; import edu.cmu.minorthird.util.gui.Viewer; import edu.cmu.minorthird.util.gui.Visible; /** Implements the Span interface. * * @author William Cohen */ public class BasicSpan implements Span,Serializable,Visible{ static private Logger log=Logger.getLogger(BasicSpan.class); static private final boolean DEBUG=log.isDebugEnabled(); static private final long serialVersionUID=20080303L; private String documentId; private String documentGroupId; public int loTextTokenIndex,loCharIndex=-1,hiCharIndex=-1; private int charOffset=0; private int spanLen; // The number of tokens in the span. private TextToken[] textTokens; private String text=null; /** Constructor assumes that the textTokens are all from the documentId. */ public BasicSpan(String documentId,TextToken[] textTokens, int loTextTokenIndex,int spanLen,String documentGroupId){ this.documentId=documentId; this.textTokens=textTokens; this.loTextTokenIndex=loTextTokenIndex; this.spanLen=spanLen; this.documentGroupId=documentGroupId==null?documentId:documentGroupId; } @Override public String getDocumentId(){ return documentId; } @Override public String getDocumentGroupId(){ return documentGroupId; } @Override public String getDocumentContents(){ if(textTokens.length==0) return ""; else return textTokens[0].getDocument(); } /** Returns the number of tokens in the span. */ @Override public int size(){ return spanLen; } /** Retrieves the ith TextToken in the span */ @Override public TextToken getTextToken(int i){ if(i<0||i>=spanLen) throw new IllegalArgumentException("out of range: "+i); return textTokens[loTextTokenIndex+i]; } /** Retrieves the ith TextToken in the span */ @Override public Token getToken(int i){ //return getTextToken(i); if(i<0||i>=spanLen) throw new IllegalArgumentException("out of range: "+i); return textTokens[loTextTokenIndex+i]; } /** Create a new BasicSpan, covering the indicated TextToken's. */ @Override public Span subSpan(int start,int len){ if(start<0||start+len>spanLen) throw new IllegalArgumentException("out of range: "+start+","+len); return new BasicSpan(documentId,textTokens,loTextTokenIndex+start,len, documentGroupId); } /** Create a SubSpan of this span, covering the indicated TextToken's. */ public SubSpan subSpan(int startIndex,int start,int len){ if(start<0||start+len>spanLen) throw new IllegalArgumentException("out of range: "+start+","+len); return new SubSpan(documentId,textTokens,loTextTokenIndex+start,len, documentGroupId,startIndex); } /** A larger span containing this span. */ @Override public Span documentSpan(){ return new BasicSpan(documentId,textTokens,0,textTokens.length, documentGroupId); } /** The index of this span in the home span. */ @Override public int documentSpanStartIndex(){ return loTextTokenIndex; } @Override public boolean contains(Span other){ if(!other.getDocumentId().equals(getDocumentId())) return false; int myStart=documentSpanStartIndex(); int otherStart=other.documentSpanStartIndex(); int myEnd=documentSpanStartIndex()+size(); int otherEnd=other.documentSpanStartIndex()+other.size(); return(myStart<=otherStart&&myEnd>=otherStart&&myStart<=otherEnd&&myEnd>=otherEnd); } @Override public boolean overlaps(Span other){ if(!other.getDocumentId().equals(getDocumentId())) return false; int myStart=documentSpanStartIndex(); int otherStart=other.documentSpanStartIndex(); int myEnd=documentSpanStartIndex()+size(); int otherEnd=other.documentSpanStartIndex()+other.size(); return(myStart<=otherStart&&myEnd>=otherStart // [ ... ( ... ] - partial containment 1 ||myStart<=otherEnd&&myEnd>=otherEnd // [ ... ) ... ] - partial containment 2 ||other.contains(this) // ( .. [ ... ] ... ) - containment ); } /** Find the string contained in a Span. */ @Override public String asString(){ if(size()<=0) return ""; else if(text==null){ TextToken lo=getTextToken(0); TextToken hi=getTextToken(size()-1); text=lo.getDocument().substring(lo.getLo(),hi.getHi()); } return text; } /** A length-zero span for the left boundary */ @Override public Span getLeftBoundary(){ return new BasicSpan(documentId,textTokens,loTextTokenIndex,0, documentGroupId); } /** A length-zero span for the left boundary */ @Override public Span getRightBoundary(){ return new BasicSpan(documentId,textTokens,loTextTokenIndex+spanLen,0, documentGroupId); } // Implement comparable @Override public int compareTo(Span other){ int cmp1=getDocumentId().compareTo(other.getDocumentId()); if(cmp1!=0) return cmp1; int cmp2=documentSpanStartIndex()-other.documentSpanStartIndex(); if(cmp2!=0) return cmp2; int cmp3=size()-other.size(); if(cmp3!=0) return cmp3; return 0; } // for safe hashing @Override public int hashCode(){ return documentId.hashCode()^loTextTokenIndex^spanLen; } @Override public boolean equals(Object o){ return o instanceof BasicSpan&&compareTo((Span)o)==0; } @Override public String toString(){ StringBuffer buf=new StringBuffer(""); buf.append("Span '"+asString()+"'"); buf.append(" = tokens "+loTextTokenIndex+":"+(loTextTokenIndex+spanLen)+ " in "); buf.append(documentId+"/"+documentGroupId); return buf.toString(); } @Override public Span charIndexSubSpan(int lo,int hi){ return charIndexSubSpan(lo,hi,false); } @Override public Span charIndexProperSubSpan(int lo,int hi){ return charIndexSubSpan(lo,hi,true); } @Override public void setCharOffset(int charOffset){ this.charOffset=charOffset; } @Override public int getCharOffset(){ return charOffset; } /** Converts from a span in character offsets within a document * Span to a token span for that document Span. */ private Span charIndexSubSpan(int lo,int hi,boolean proper){ loCharIndex=lo; hiCharIndex=hi; // find token that start & end closest to lo and hi int minStartDist=Integer.MAX_VALUE; int minEndDist=Integer.MAX_VALUE; int firstTextToken=-1,lastTextToken=-1; for(int i=0;i<size();i++){ if(!proper){ if(DEBUG) log.debug("considering token '"+getTextToken(i)+"' from lo="+ getTextToken(i).getLo()+" to hi="+getTextToken(i).getHi()); int startDist=distance(getTextToken(i).getLo(),lo); int endDist=distance(getTextToken(i).getHi(),hi); // <= prefers later start, end tokens if(startDist<=minStartDist){ minStartDist=startDist; firstTextToken=i; if(DEBUG) log.debug("minStartDist => "+minStartDist+" for token "+ getTextToken(i)); } if(endDist<=minEndDist){ minEndDist=endDist; lastTextToken=i; if(DEBUG) log .debug("minEndDist => "+minEndDist+" for token "+ getTextToken(i)); } }else{ // The lo character offset may lie on a whitespace character before a token, at the // boundry of a token, or in the middle of a token. In any of these cases we want to // make sure to include this token in the span. So simply check that the lo character // offset is less than the hi index of the token. That is check that at least one // character of the token is included in the char offsets. if(firstTextToken<0&&lo<=getTextToken(i).getHi()){ firstTextToken=i; if(DEBUG) log.debug("firstTextToken => "+getTextToken(i)); } /* OLD LOGIC HERE: if (getTextToken(i).getLo()>=lo && firstTextToken<0) { firstTextToken = i; if (DEBUG) log.debug("firstTextToken => "+getTextToken(i)); } */ // The hi character offset may lie on a whitespace character after a token, at the // boundry of a token, or in the middle of a token. Again, we need to include this // token in the subspan in any of these cases. So continually increas the last // included token index (lastTextToken) for each subsequent token that is included in // in the span. Do this by simply checking that the hi character offset is greater // than the token's lo character index. if(hi>getTextToken(i).getLo()){ lastTextToken=i; if(DEBUG) log.debug("lastTextToken => "+getTextToken(i)); } /* OLD LOGIC HERE if (getTextToken(i).getHi()<=hi) { lastTextToken = i; if (DEBUG) log.debug("lastTextToken => "+getTextToken(i)); } */ } } if(firstTextToken<0||lastTextToken<0){ throw new IllegalArgumentException("no proper subspan for lo="+lo+" hi="+ hi+" for: "+this); } return subSpan(loCharIndex,firstTextToken,lastTextToken-firstTextToken+1); } @Override public int getLoTextToken(){ return loTextTokenIndex; } /** Returns how many characters are before the span in the document */ @Override public int getLoChar(){ return textTokens[loTextTokenIndex].getLo(); } /** Returns how many characters there are up to and including the span */ @Override public int getHiChar(){ return textTokens[loTextTokenIndex+size()-1].getHi(); } private int distance(int i,int j){ return(i>=j?i-j:j-i); } @Override public Viewer toGUI(){ return new SpanViewer.ControlledTextViewer(this); //return new SpanViewer.TextViewer(this); } }