package ch.akuhn.hapax; import ch.akuhn.hapax.corpus.TermScanner; import ch.akuhn.hapax.corpus.Terms; import ch.akuhn.hapax.index.LatentSemanticIndex; import ch.akuhn.hapax.index.Ranking; import ch.akuhn.hapax.index.TermDocumentMatrix; /** Searchable index of a text corpus. * * @author Adrian Kuhn, 2009. * */ public final class Hapax { private TermScanner scanner; private boolean ignoreCase; private LatentSemanticIndex latentIndex; public Hapax(CorpusBuilder corpusBuilder) { this.scanner = corpusBuilder.scanner; this.ignoreCase = corpusBuilder.ignoreCase; this.latentIndex = corpusBuilder.makeTDM().createIndex(corpusBuilder.latentDimensions); } public static CorpusBuilder newCorpus() { return new CorpusBuilder(); } public Ranking<String> find(String content) { Terms query = new Terms(); scanner.newInstance().client(query).onString(content).run(); if (ignoreCase) query = query.toLowerCase(); return latentIndex.rankDocumentsByQuery(query); } public synchronized void updateDocument(String doc, String contents) { Terms document = scanner.fromString(contents); if (ignoreCase) document = document.toLowerCase(); latentIndex.updateDocument(doc, document); } public synchronized void removeDocument(String doc) { latentIndex.removeDocument(doc); } public LatentSemanticIndex getIndex() { return latentIndex; } public static CorpusBuilder withCorpus(TermDocumentMatrix tdm) { return new CorpusBuilder(tdm); } }