package ch.akuhn.hapax; import java.io.File; import java.io.InputStream; import ch.akuhn.hapax.corpus.CamelCaseScanner; import ch.akuhn.hapax.corpus.LetterScanner; import ch.akuhn.hapax.corpus.TermScanner; import ch.akuhn.hapax.corpus.WordScanner; import ch.akuhn.hapax.index.GlobalWeighting; import ch.akuhn.hapax.index.LocalWeighting; import ch.akuhn.hapax.index.TermDocumentMatrix; import ch.akuhn.util.Files; public final class CorpusBuilder { TermDocumentMatrix corpus; TermScanner scanner = new WordScanner(); private LocalWeighting local = LocalWeighting.NULL; private GlobalWeighting global = GlobalWeighting.NULL; private boolean rejectStopwords = true; private boolean rejectRareTerms = true; @SuppressWarnings("unused") // TODO private boolean rejectCommonTerms = true; boolean ignoreCase = true; int latentDimensions = 25; public CorpusBuilder(TermDocumentMatrix tdm) { this.corpus = tdm; } public CorpusBuilder() { this(new TermDocumentMatrix()); } public CorpusBuilder addDocument(String doc, String contents) { corpus.putDocument(doc, scanner.fromString(contents)); return this; } public CorpusBuilder addFiles(String folder, String... extensions) { for (File each : Files.find(folder, extensions)) { corpus.putDocument(each.getAbsolutePath(), scanner.fromFile(each)); } return this; } public CorpusBuilder dontUseWeighting() { local = LocalWeighting.NULL; global = GlobalWeighting.NULL; return this; } public TermDocumentMatrix makeTDM() { TermDocumentMatrix tdm = corpus; if (ignoreCase) tdm = tdm.toLowerCase(); if (rejectRareTerms) tdm = tdm.rejectHapaxes(); if (rejectStopwords) tdm = tdm.rejectStopwords(); // TODO if (rejectCommonTerms) tdm = tdm.rejectCommonTerms(); return tdm.weight(local, global); } public CorpusBuilder rejectCommonTerms() { rejectCommonTerms = true; return this; } public CorpusBuilder rejectRareTerms() { rejectRareTerms = true; return this; } public CorpusBuilder rejectStopwords() { rejectStopwords = true; return this; } public CorpusBuilder useCamelCaseScanner() { scanner = new CamelCaseScanner(); return this; } public CorpusBuilder useTFIDF() { local = LocalWeighting.TERM; global = GlobalWeighting.IDF; return this; } public CorpusBuilder useWordScanner() { scanner = new WordScanner(); return this; } public CorpusBuilder useLetterScanner() { scanner = new LetterScanner(); return this; } public CorpusBuilder beCaseSensitiv() { ignoreCase = false; return this; } public CorpusBuilder ignoreCase() { ignoreCase = true; return this; } public CorpusBuilder latentDimensions(int rank) { latentDimensions = rank; return this; } public Hapax build() { return new Hapax(this); } public CorpusBuilder addDocument(String doc, InputStream stream) { corpus.putDocument(doc, scanner.fromInpuStream(stream)); return this; } }