package org.codemap.tasks;

import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Collection;

import org.codemap.util.Log;

import ch.akuhn.hapax.CorpusBuilder;
import ch.akuhn.hapax.Hapax;
import ch.akuhn.hapax.index.LatentSemanticIndex;
import ch.akuhn.util.ProgressMonitor;
import ch.akuhn.values.Arguments;
import ch.akuhn.values.TaskValue;
import ch.akuhn.values.Value;

/**
 * Task that builds a {@link LatentSemanticIndex} from a collection of strings,
 * each of which is used both as the document name and as the path of the file
 * to read.
 */
public class ComputeIndexTask extends TaskValue<LatentSemanticIndex> {

    /** Number of latent dimensions requested from the corpus builder. */
    private static final int LATENT_DIMENSIONS = 25;

    /**
     * Creates the task.
     *
     * @param elements value holding the paths of the files to index
     */
    public ComputeIndexTask(Value<Collection<String>> elements) {
        super("Creating latent semantic index", elements);
    }

    /**
     * Adds every element to a freshly configured corpus and creates the index
     * from the resulting term-document matrix. The monitor is advanced by one
     * unit per element, whether or not the element could be read.
     *
     * @param monitor progress monitor, started with the number of elements
     * @param args    task arguments; the next argument must be the element collection
     * @return the index created from the corpus
     */
    @Override
    protected LatentSemanticIndex computeValue(ProgressMonitor monitor, Arguments args) {
        Collection<String> elements = args.nextOrFail();
        monitor.begin(elements.size());
        CorpusBuilder builder = Hapax.newCorpus()
                .ignoreCase()
                .useCamelCaseScanner()
                .rejectRareTerms()
                .rejectStopwords()
                .latentDimensions(LATENT_DIMENSIONS)
                .useTFIDF();
        for (String path : elements) {
            parseElement(builder, path);
            monitor.worked(1);
        }
        return builder.makeTDM().createIndex();
    }

    /**
     * Opens the file at {@code path} and adds it to the corpus under the name
     * {@code path}. A file that cannot be opened is logged and skipped. The
     * stream is always closed before this method returns.
     *
     * @param builder corpus builder that receives the document
     * @param path    file system path of the document; also used as its name
     */
    protected void parseElement(CorpusBuilder builder, String path) {
        FileInputStream stream = null;
        try {
            stream = new FileInputStream(path);
            // NOTE(review): closing below assumes addDocument consumes the
            // stream before it returns - confirm against CorpusBuilder.
            builder.addDocument(path, stream);
        } catch (FileNotFoundException e) {
            Log.error(e);
        } finally {
            if (stream != null) {
                try {
                    stream.close();
                } catch (IOException e) {
                    // A failed close must not abort indexing of the remaining files.
                    Log.error(e);
                }
            }
        }
    }
}