/* Copyright 2003, Carnegie Mellon, All Rights Reserved */

package edu.cmu.minorthird.text;

import java.util.Iterator;

/**
 * Maintains information about what's in a set of documents.
 *
 * <p>Specifically, a text base holds a set of character sequences
 * (TextToken's) drawn from some set of containing documents, typically
 * produced by tokenization.
 *
 * @author William Cohen
 * @author Quinten Mercer
 */
public interface TextBase {

  /**
   * Returns the {@link edu.cmu.minorthird.text.Tokenizer} used on the
   * documents in this text base.
   *
   * @return the tokenizer associated with this text base
   */
  Tokenizer getTokenizer();

  /**
   * Returns the number of documents contained in this TextBase.
   *
   * @return the document count
   */
  int size();

  /**
   * Returns the {@link Document} with the given ID.
   *
   * @param docID identifier of the document to fetch
   * @return the document registered under {@code docID}
   */
  Document getDocument(String docID);

  /**
   * Returns an iterator over the documents in this TextBase. As the return
   * type indicates, each element is handed back as a {@link Span}.
   *
   * @return an iterator yielding document spans
   */
  Iterator<Span> documentSpanIterator();

  /**
   * Looks up the document Span for the given documentId.
   *
   * @param documentId identifier of the document whose span is wanted
   * @return the matching Span, or {@code null} if a document with
   *         {@code documentId} was not found in this TextBase
   */
  Span documentSpan(String documentId);
}