package ch.akuhn.hapax.corpus;
/**
 * A text corpus that keeps the term frequencies of each document.
 * Documents and terms are both identified by plain strings.
 *
 * <p>Subclasses supply document storage through the abstract methods; the
 * corpus-wide queries ({@link #terms()}, {@link #containsTerm(String)},
 * {@link #termCount()}) are derived here by visiting every document.
 *
 * @author Adrian Kuhn
 *
 */
public abstract class Corpus {

    // ---- storage primitives supplied by subclasses ----

    /**
     * Registers the given terms under the given document name.
     * How an already-present name is handled is left to the implementation.
     *
     * @param name    identifier of the document
     * @param content terms of the document
     */
    public abstract void putDocument(String name, Terms content);

    /**
     * Looks up the terms of a single document.
     *
     * @param doc identifier of the document
     * @return the terms stored for {@code doc}; behavior for an unknown
     *         identifier is implementation-defined
     */
    public abstract Terms getDocument(String doc);

    /**
     * @return the identifiers of the documents in this corpus; each one is
     *         usable as an argument to {@link #getDocument(String)}
     */
    public abstract Iterable<String> documents();

    /**
     * @return the number of documents in this corpus
     */
    public abstract int documentCount();

    /**
     * @param doc identifier of the document to look for
     * @return whether this corpus holds a document with that identifier
     */
    public abstract boolean containsDocument(String doc);

    // ---- queries derived from the primitives above ----

    /**
     * Builds a fresh {@link Terms} instance and adds the terms of every
     * document to it. The result is recomputed on each call.
     *
     * @return the combined terms of all documents
     */
    public Terms terms() {
        Terms merged = new Terms();
        for (String name : documents()) {
            Terms content = getDocument(name);
            merged.addAll(content);
        }
        return merged;
    }

    /**
     * Checks the documents one after another and stops at the first one
     * that contains the term.
     *
     * @param term the term to search for
     * @return {@code true} if at least one document contains {@code term},
     *         {@code false} otherwise (including when there are no documents)
     */
    public boolean containsTerm(String term) {
        for (String name : documents()) {
            Terms content = getDocument(name);
            if (content.contains(term)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Reports the unique size of the combined {@link #terms()}, which are
     * rebuilt from all documents on every call.
     *
     * @return {@code terms().uniqueSize()} (presumably the number of
     *         distinct terms; see {@code Terms} to confirm)
     */
    public int termCount() {
        Terms all = terms();
        return all.uniqueSize();
    }

    /**
     * Describes this corpus by its runtime class name, its document count
     * and its term count. Note that obtaining the term count walks over
     * every document.
     */
    @Override
    public String toString() {
        // Evaluate in the same left-to-right order as the format arguments.
        String className = this.getClass().getName();
        int docs = documentCount();
        int termTotal = termCount();
        return String.format("%s (%d documents, %d terms)", className, docs, termTotal);
    }
}