package uk.ac.shef.dcs.jate.app;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.algorithm.Weirdness;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBased;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBasedFBMaster;
import uk.ac.shef.dcs.jate.feature.TTFReferenceFeatureFileBuilder;
import uk.ac.shef.dcs.jate.model.JATETerm;
import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Application wrapper that runs the {@link Weirdness} algorithm over the
 * candidate terms of a Solr core, either from the command line (see
 * {@link #main(String[])}) or programmatically through the {@code extract}
 * methods.
 * <p>
 * In addition to the parameters handled by {@link App}, the reference
 * frequency file ({@link AppParams#REFERENCE_FREQUENCY_FILE}) is read in the
 * constructor and used when extracting.
 */
public class AppWeirdness extends App {
    private static final Logger log = LoggerFactory.getLogger(AppWeirdness.class.getName());

    /**
     * Command-line entry point. Optionally indexes a corpus, then extracts
     * terms and, if requested, writes them out.
     * <p>
     * Exits with status 0 on success and 1 when too few arguments are given or
     * when indexing/extraction fails.
     *
     * @param args
     *            optional run-time parameters (see
     *            {@link uk.ac.shef.dcs.jate.app.AppParams}; the reference
     *            frequency file {@link AppParams#REFERENCE_FREQUENCY_FILE} is
     *            required by Weirdness) followed by two positional arguments:
     *            the Solr home path and the Solr core name, in that order
     */
    public static void main(String[] args) {
        // The last two arguments are read positionally below, so fewer than
        // two arguments cannot be a valid invocation.
        if (args.length < 2) {
            printHelp();
            System.exit(1);
        }
        String solrHomePath = args[args.length - 2];
        String solrCoreName = args[args.length - 1];

        Map<String, String> params = getParams(args);
        String jatePropertyFile = getJATEProperties(params);
        String corpusDir = getCorpusDir(params);

        try {
            App weirdness = new AppWeirdness(params);
            if (isCorpusProvided(corpusDir)) {
                weirdness.index(Paths.get(corpusDir), Paths.get(solrHomePath), solrCoreName, jatePropertyFile);
            }

            List<JATETerm> terms = weirdness.extract(solrHomePath, solrCoreName, jatePropertyFile);
            if (isExport(params)) {
                weirdness.write(terms);
            }
            System.exit(0);
        } catch (IOException | JATEException e) {
            // Report the failure and signal it to the calling shell/script
            // instead of returning as if the run had succeeded.
            log.error("Weirdness term extraction failed", e);
            System.exit(1);
        }
    }

    /**
     * Creates the application and initialises the reference frequency
     * parameter from the given map.
     *
     * @param initParams
     *            pre-filtering, post-filtering parameters and the Weirdness
     *            specific parameter
     * @throws JATEException
     *             propagated from the superclass constructor or from the
     *             reference frequency parameter initialisation
     * @see AppParams
     * @see AppParams#REFERENCE_FREQUENCY_FILE
     */
    public AppWeirdness(Map<String, String> initParams) throws JATEException {
        super(initParams);
        initalizeRefFreqParam(initParams);
    }

    /**
     * Loads the JATE properties from the given file and delegates to
     * {@link #extract(SolrCore, JATEProperties)}.
     *
     * @param core
     *            Solr core holding the indexed corpus
     * @param jatePropertyFile
     *            location of the JATE property file
     * @return the extracted terms
     * @throws IOException
     *             declared by the overridden method
     * @throws JATEException
     *             if loading the properties or the extraction fails
     */
    @Override
    public List<JATETerm> extract(SolrCore core, String jatePropertyFile) throws IOException, JATEException {
        JATEProperties properties = getJateProperties(jatePropertyFile);
        return extract(core, properties);
    }

    /**
     * Builds the frequency features, runs {@link Weirdness} over the candidate
     * terms and returns the selected terms.
     * <p>
     * The searcher obtained from {@code core} is closed before this method
     * returns, whether or not extraction succeeds.
     *
     * @param core
     *            Solr core holding the indexed corpus
     * @param properties
     *            JATE properties used by the feature builders and to look up
     *            the Solr field names
     * @return the terms remaining after scoring and cut-off
     * @throws JATEException
     *             if building a feature or running the algorithm fails
     */
    public List<JATETerm> extract(SolrCore core, JATEProperties properties) throws JATEException {
        // NOTE(review): core.getSearcher() hands out a ref-counted handle in
        // Solr; confirm that closing the searcher directly (rather than
        // releasing the handle) is intended here.
        SolrIndexSearcher searcher = core.getSearcher().get();
        try {
            // The third constructor argument (0 here, 1 below) presumably
            // selects term-level vs. word-level frequencies -- confirm against
            // FrequencyTermBasedFBMaster.
            this.freqFeatureBuilder = new FrequencyTermBasedFBMaster(searcher, properties, 0);
            this.freqFeature = (FrequencyTermBased) freqFeatureBuilder.build();

            FrequencyTermBasedFBMaster wordFreqBuilder = new FrequencyTermBasedFBMaster(searcher, properties, 1);
            FrequencyTermBased wordFreq = (FrequencyTermBased) wordFreqBuilder.build();

            // Frequencies loaded from the reference frequency file.
            TTFReferenceFeatureFileBuilder refFreqBuilder =
                    new TTFReferenceFeatureFileBuilder(this.referenceFrequencyFilePath);
            FrequencyTermBased refFreq = refFreqBuilder.build();

            Weirdness weirdness = new Weirdness();
            weirdness.registerFeature(FrequencyTermBased.class.getName() + Weirdness.SUFFIX_WORD, wordFreq);
            weirdness.registerFeature(FrequencyTermBased.class.getName() + Weirdness.SUFFIX_REF, refFreq);

            // Candidates are the keys of the term-to-TTF map, copied into a
            // fresh list before being passed to filterByTTF.
            List<String> candidates = new ArrayList<>(this.freqFeature.getMapTerm2TTF().keySet());
            filterByTTF(candidates);

            List<JATETerm> terms = weirdness.execute(candidates);
            terms = cutoff(terms);

            addAdditionalTermInfo(terms, searcher, properties.getSolrFieldNameJATENGramInfo(),
                    properties.getSolrFieldNameID());
            return terms;
        } finally {
            try {
                searcher.close();
            } catch (IOException e) {
                // Closing is best-effort; log with the stack trace but do not
                // mask the result or exception of the extraction itself.
                log.error("Failed to close the Solr index searcher", e);
            }
        }
    }

    /**
     * Prints command-line usage for this application to standard output.
     */
    protected static void printHelp() {
        StringBuilder sb = new StringBuilder("Weirdness Usage:\n");
        // The positional arguments named here mirror what main() reads: Solr
        // home path first, Solr core name last.
        sb.append("java -cp '[CLASSPATH]' ").append(AppWeirdness.class.getName()).append(" [OPTIONS] ")
                .append("-r [REF_TERM_TF_FILE] [SOLR_HOME_PATH] [SOLR_CORE_NAME]").append("\nE.g.:\n");
        sb.append("java -cp '/libs/*' ").append(AppWeirdness.class.getName())
                .append(" -t 20 -r /resource/bnc_unifrqs.normal /solr/server/solr jate\n\n");
        sb.append("[OPTIONS]:\n")
                .append("\t\t-c\t\t'true' or 'false'. Whether to collect term information, e.g., offsets in documents. Default is false.\n")
                .append("\t\t-t\t\tA number. Score threshold for selecting terms. If not set then default -n is used.")
                .append("\n")
                .append("\t\t-n\t\tA number. If an integer is given, top N candidates are selected as terms. \n")
                .append("\t\t\t\tIf a decimal number is given, top N% of candidates are selected. Default is 0.25.\n");
        sb.append("\t\t-o\t\tA file path. If provided, the output is written to the file. \n")
                .append("\t\t\t\tOtherwise, output is written to the console.");
        System.out.println(sb);
    }
}