package eu.project.ttc.eval;

import java.nio.file.Paths;

import com.google.common.base.Charsets;

import eu.project.ttc.api.TerminoExtractor;
import eu.project.ttc.api.TerminoExtractor.ContextualizerMode;
import eu.project.ttc.api.TerminoFilterConfig;
import eu.project.ttc.engines.cleaner.TermProperty;
import eu.project.ttc.engines.desc.Lang;

/**
 * Mutable, fluent holder for the parameters used to build a
 * {@link TerminoExtractor} in {@link #toExtractor(Lang, Corpus)}.
 *
 * <p>Defaults: {@code scope = 3}, {@code swtOnly = true},
 * {@code frequencyTh = 2}, {@code coocFrequencyTh = 1}.
 * The working setters return {@code this} so calls can be chained.
 */
public class TerminoConfig {

	/** Scope value handed to {@code useContextualizer}. */
	private int scope = 3;

	/** Selects {@code ON_SWT_TERMS} when true, {@code ON_ALL_TERMS} otherwise. */
	private boolean swtOnly = true;

	/** Frequency threshold; a pre-filter is installed only when it is greater than 1. */
	private int frequencyTh = 2;

	/**
	 * Co-occurrence frequency threshold. It cannot be changed through its setter
	 * and is not read by {@link #toExtractor(Lang, Corpus)}.
	 */
	private int coocFrequencyTh = 1;

	/** Creates a configuration holding the default values. */
	public TerminoConfig() {
	}

	/**
	 * @return the scope passed to the contextualizer
	 */
	public int getScope() {
		return this.scope;
	}

	/**
	 * @param scope the scope to pass to the contextualizer
	 * @return this configuration, for chaining
	 */
	public TerminoConfig setScope(int scope) {
		this.scope = scope;
		return this;
	}

	/**
	 * @return {@code true} if the contextualizer is configured with
	 *         {@code ContextualizerMode.ON_SWT_TERMS}
	 */
	public boolean isSwtOnly() {
		return this.swtOnly;
	}

	/**
	 * @param swtOnly {@code true} to use {@code ContextualizerMode.ON_SWT_TERMS},
	 *                {@code false} to use {@code ContextualizerMode.ON_ALL_TERMS}
	 * @return this configuration, for chaining
	 */
	public TerminoConfig setSwtOnly(boolean swtOnly) {
		this.swtOnly = swtOnly;
		return this;
	}

	/**
	 * @return the frequency threshold
	 */
	public int getFrequencyTh() {
		return this.frequencyTh;
	}

	/**
	 * @param frequencyTh the frequency threshold; values of 1 or less mean
	 *                    no pre-filter is installed on the extractor
	 * @return this configuration, for chaining
	 */
	public TerminoConfig setFrequencyTh(int frequencyTh) {
		this.frequencyTh = frequencyTh;
		return this;
	}

	/**
	 * @return the co-occurrence frequency threshold (always the default, since
	 *         the setter is not implemented)
	 */
	public int getCoocFrequencyTh() {
		return this.coocFrequencyTh;
	}

	/**
	 * Not supported: the value is never stored.
	 *
	 * @param coocFrequencyTh ignored
	 * @return never returns normally
	 * @throws UnsupportedOperationException always
	 */
	public TerminoConfig setCoocFrequencyTh(int coocFrequencyTh) {
		throw new UnsupportedOperationException("Not yet implemented");
	}

	/**
	 * Builds a {@link TerminoExtractor} over the {@code **}{@code /*.txt} files
	 * (read as UTF-8) found under {@code <corpus root>/<lang name>}.
	 *
	 * <p>Scoring and variation detection are disabled, the contextualizer is
	 * enabled with this configuration's scope and mode, and a frequency
	 * pre-filter is added when {@code frequencyTh > 1}.
	 *
	 * @param lang   the language; its name is used as the corpus sub-directory
	 * @param corpus the corpus whose root directory contains the language directory
	 * @return the configured extractor
	 */
	public TerminoExtractor toExtractor(Lang lang, Corpus corpus) {
		final String rootDir = corpus.getRootDir().toString();
		final String langDir = Paths.get(rootDir, lang.getName()).toString();

		final ContextualizerMode mode;
		if (swtOnly) {
			mode = ContextualizerMode.ON_SWT_TERMS;
		} else {
			mode = ContextualizerMode.ON_ALL_TERMS;
		}

		TerminoExtractor result = TerminoExtractor
				.fromTxtCorpus(lang, langDir, "**/*.txt", Charsets.UTF_8.name())
				.setTreeTaggerHome(TermSuiteEvals.getTreeTaggerPath().toString())
				.disableScoring()
				.disableVariationDetection()
				.useContextualizer(scope, mode);

		// A threshold of 1 or less would keep everything, so no filter is installed.
		if (frequencyTh <= 1) {
			return result;
		}

		TerminoFilterConfig frequencyFilter = new TerminoFilterConfig()
				.by(TermProperty.FREQUENCY)
				.keepOverTh(frequencyTh);
		result.preFilter(frequencyFilter);
		return result;
	}

	/**
	 * @return a short label of the form {@code scope<scope>-th<frequencyTh>};
	 *         {@code swtOnly} and {@code coocFrequencyTh} are not part of it
	 */
	@Override
	public String toString() {
		return String.format("scope%d-th%d", this.scope, this.frequencyTh);
	}
}