BasicTextBase.java example

Explorer
MinorThird-master
package edu.cmu.minorthird.text;

import java.io.Serializable;
import java.util.Iterator;
import java.util.SortedMap;
import java.util.TreeMap;

import org.apache.log4j.Logger;

/**
 * Maintains information about what's in a set of documents. Specifically, this
 * contains a set of character sequences (TextToken's) from some sort of set of
 * containing documents - typically found by tokenization.
 * 
 * @author William Cohen
 * @author Cameron Williams
 * @author Quinten Mercer
 */

public class BasicTextBase extends MutableTextBase implements Serializable{

	// Minorthird administrative stuff
	static Logger log=Logger.getLogger(BasicTextBase.class);

	static private final long serialVersionUID=20080202L;

	// Underlying document store.
	private SortedMap<String,Document> documentMap=new TreeMap<String,Document>();

	// map documentId to name of 'group' of documents it belongs to
	private SortedMap<String,String> documentGroupMap=
			new TreeMap<String,String>();

	/** Default constructor creates a new TextBase with the default Tokenizer. */
	public BasicTextBase(){
		super(new RegexTokenizer());
	}

	/**
	 * Constructor that specifies a custom Tokenizer to be used with this
	 * TextBase.
	 */
	public BasicTextBase(Tokenizer t){
		super(t);
	}

	//
	// Implementations of MutableTextBase abstract methods
	//
	/**
	 * Adds a document to this TextBase with documentId as its identifier and with
	 * text specified by documentString.
	 */
	@Override
	public void loadDocument(String documentId,String documentString){
		// create the document and add the tokens to that document
		Document document=new Document(documentId,documentString);
		TextToken[] tokenArray=getTokenizer().splitIntoTokens(document);
		document.setTokens(tokenArray);
		documentMap.put(documentId,document);
	}

	/**
	 * Adds a document to this TextBase with documentId as its identifier and with
	 * text specified by documentString. Also, this method sets the offset
	 * parameter in the new Document to the specified charOffset.
	 */
	@Override
	public void loadDocument(String documentId,String documentString,
			int charOffset){
		// create the document and add the tokens to that document
		Document document=new Document(documentId,documentString,charOffset);
		TextToken[] tokenArray=getTokenizer().splitIntoTokens(document);
		document.setTokens(tokenArray);
		documentMap.put(documentId,document);
	}

	/**
	 * Sets the document group id for the specified documentId to the specified
	 * document group id.
	 */
	@Override
	public void setDocumentGroupId(String documentId,String documentGroupId){
		documentGroupMap.put(documentId,documentGroupId);
	}

	/** Returns the number of documents currently in this TextBase. */
	@Override
	public int size(){
		return documentMap.size();
	}

	/**
	 * Returns the Document instance that corresponds to the specified documentId
	 * or null if no document exists with the specified documentId.
	 */
	@Override
	public Document getDocument(String documentId){
		return documentMap.get(documentId);
	}

	/**
	 * Returns a Span instance that encloses all of the tokens in the document
	 * specified by documentId. Note that this Span instance will NOT include any
	 * white space that comes before the first token or after the last token.
	 */
	@Override
	public Span documentSpan(String documentId){
		TextToken[] textTokens=getTokenArray(documentId);
		if(textTokens==null)
			return null;
		else
			return new BasicSpan(documentId,textTokens,0,textTokens.length,
					documentGroupMap.get(documentId));
	}

	/**
	 * Returns a Span.Looper instance that includes a document span for every
	 * document in this TextBase.
	 */
	@Override
	public Iterator<Span> documentSpanIterator(){
		return new MyDocumentSpanIterator();
	}

	/** Helper class that is used to iterate through document spans. */
	private class MyDocumentSpanIterator implements Iterator<Span>{

		private Iterator<String> k=documentMap.keySet().iterator();

		@Override
		public void remove(){
			throw new UnsupportedOperationException(
					"Cannot remove documents from a TextBase.");
		}

		@Override
		public boolean hasNext(){
			return k.hasNext();
		}

		@Override
		public Span next(){
			String documentId=k.next();
			TextToken[] textTokens=getTokenArray(documentId);
			Span s=new BasicSpan(documentId,textTokens,0,textTokens.length,documentGroupMap.get(documentId));
			s.setCharOffset(getOffset(documentId));
			return s;
		}

//		public int estimatedSize(){
//			return documentMap.keySet().size();
//		}
	}

	private int getOffset(String documentId){
		Document document=documentMap.get(documentId);
		if(document!=null)
			return document.charOffset;
		else
			return -1;
	}

	/**
	 * Helper method used internally to make getting at the token array for a
	 * specific document id easier.
	 */
	private TextToken[] getTokenArray(String documentId){
		Document document=documentMap.get(documentId);
		if(document!=null)
			return document.getTokens();
		return null;
	}

	//
	// basic test routine that loads each argument as a document, then iterates
	// through them printing them out.
	//    
	static public void main(String[] args){
		BasicTextBase b=new BasicTextBase();
		for(int i=0;i<args.length;i++){
			b.loadDocument("arg_"+i,args[i]);
		}
		for(Iterator<Span> i=b.documentSpanIterator();i.hasNext();){
			System.out.println(i.next());
		}
	}
}