package edu.cmu.minorthird.text;
import java.io.Serializable;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;
import org.apache.log4j.Logger;
/**
 * Maintains information about what's in a set of documents. Specifically, this
 * contains a set of character sequences (TextToken's) drawn from a set of
 * containing documents, typically produced by tokenization.
 *
 * @author William Cohen
 * @author Cameron Williams
 * @author Quinten Mercer
 */
public class BasicTextBase extends MutableTextBase implements Serializable{

    // Minorthird administrative stuff
    static final Logger log=Logger.getLogger(BasicTextBase.class);
    private static final long serialVersionUID=20080202L;

    /** Underlying document store, keyed by documentId (sorted for a stable iteration order). */
    private final SortedMap<String,Document> documentMap=new TreeMap<String,Document>();

    /** Maps a documentId to the name of the 'group' of documents it belongs to. */
    private final SortedMap<String,String> documentGroupMap=new TreeMap<String,String>();

    /** Default constructor creates a new TextBase with the default Tokenizer. */
    public BasicTextBase(){
        super(new RegexTokenizer());
    }

    /**
     * Constructor that specifies a custom Tokenizer to be used with this
     * TextBase.
     */
    public BasicTextBase(Tokenizer t){
        super(t);
    }

    //
    // Implementations of MutableTextBase abstract methods
    //

    /**
     * Adds a document to this TextBase with documentId as its identifier and with
     * text specified by documentString.
     */
    @Override
    public void loadDocument(String documentId,String documentString){
        tokenizeAndStore(documentId,new Document(documentId,documentString));
    }

    /**
     * Adds a document to this TextBase with documentId as its identifier and with
     * text specified by documentString. Also, this method sets the offset
     * parameter in the new Document to the specified charOffset.
     */
    @Override
    public void loadDocument(String documentId,String documentString,
            int charOffset){
        tokenizeAndStore(documentId,new Document(documentId,documentString,charOffset));
    }

    /**
     * Tokenizes the given document with this TextBase's Tokenizer, attaches the
     * resulting tokens to it, and stores it under documentId. Shared by both
     * loadDocument overloads so the tokenize-then-store logic lives in one place.
     */
    private void tokenizeAndStore(String documentId,Document document){
        TextToken[] tokenArray=getTokenizer().splitIntoTokens(document);
        document.setTokens(tokenArray);
        documentMap.put(documentId,document);
    }

    /**
     * Sets the document group id for the specified documentId to the specified
     * document group id.
     */
    @Override
    public void setDocumentGroupId(String documentId,String documentGroupId){
        documentGroupMap.put(documentId,documentGroupId);
    }

    /** Returns the number of documents currently in this TextBase. */
    @Override
    public int size(){
        return documentMap.size();
    }

    /**
     * Returns the Document instance that corresponds to the specified documentId
     * or null if no document exists with the specified documentId.
     */
    @Override
    public Document getDocument(String documentId){
        return documentMap.get(documentId);
    }

    /**
     * Returns a Span instance that encloses all of the tokens in the document
     * specified by documentId, or null if no such document exists. Note that this
     * Span instance will NOT include any white space that comes before the first
     * token or after the last token.
     */
    @Override
    public Span documentSpan(String documentId){
        TextToken[] textTokens=getTokenArray(documentId);
        if(textTokens==null){
            return null;
        }
        // NOTE(review): unlike MyDocumentSpanIterator.next(), this span's char
        // offset is never set from the stored document — confirm whether that
        // asymmetry is intended before changing it.
        return new BasicSpan(documentId,textTokens,0,textTokens.length,
                documentGroupMap.get(documentId));
    }

    /**
     * Returns an Iterator that yields a document span for every document in this
     * TextBase, in documentId order.
     */
    @Override
    public Iterator<Span> documentSpanIterator(){
        return new MyDocumentSpanIterator();
    }

    /** Helper class that is used to iterate through document spans. */
    private class MyDocumentSpanIterator implements Iterator<Span>{

        // Cursor over documentIds; TreeMap keySet yields them in sorted order.
        private final Iterator<String> k=documentMap.keySet().iterator();

        /** Removal is not supported — documents cannot be removed from a TextBase. */
        @Override
        public void remove(){
            throw new UnsupportedOperationException(
                    "Cannot remove documents from a TextBase.");
        }

        @Override
        public boolean hasNext(){
            return k.hasNext();
        }

        /** Builds the span for the next documentId, including its char offset. */
        @Override
        public Span next(){
            String documentId=k.next();
            TextToken[] textTokens=getTokenArray(documentId);
            Span s=new BasicSpan(documentId,textTokens,0,textTokens.length,
                    documentGroupMap.get(documentId));
            s.setCharOffset(getOffset(documentId));
            return s;
        }
    }

    /** Returns the charOffset of the stored document, or -1 if no such document exists. */
    private int getOffset(String documentId){
        Document document=documentMap.get(documentId);
        return document!=null?document.charOffset:-1;
    }

    /**
     * Helper method used internally to make getting at the token array for a
     * specific document id easier. Returns null if no such document exists.
     */
    private TextToken[] getTokenArray(String documentId){
        Document document=documentMap.get(documentId);
        return document!=null?document.getTokens():null;
    }

    //
    // basic test routine that loads each argument as a document, then iterates
    // through them printing them out.
    //
    static public void main(String[] args){
        BasicTextBase b=new BasicTextBase();
        for(int i=0;i<args.length;i++){
            b.loadDocument("arg_"+i,args[i]);
        }
        for(Iterator<Span> i=b.documentSpanIterator();i.hasNext();){
            System.out.println(i.next());
        }
    }
}