package uk.ac.shef.dcs.jate.app;

import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.algorithm.RAKE;
import uk.ac.shef.dcs.jate.feature.*;
import uk.ac.shef.dcs.jate.model.JATETerm;

import java.io.IOException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

/**
 * Application entry point that runs the {@link RAKE} algorithm over the
 * candidate terms stored in a Solr core and returns the scored terms.
 */
public class AppRAKE extends App {
    // Static so that the static entry point can report failures through it.
    private static final Logger log = LoggerFactory.getLogger(AppRAKE.class.getName());

    /**
     * Command-line entry point.
     *
     * <p>The last two arguments are mandatory and are read positionally: the
     * Solr home path followed by the Solr core name. Anything before them is
     * parsed as optional run-time parameters. If fewer than two arguments are
     * given, the usage text is printed and the JVM exits with status 1. The
     * JVM exits with status 0 on success and status 1 if indexing, extraction
     * or export fails.
     *
     * @param args
     *            command-line params accepting solr home path, solr core name
     *            and more optional run-time parameters
     * @see uk.ac.shef.dcs.jate.app.AppParams
     */
    public static void main(String[] args) {
        // Both trailing positional arguments are read below, so at least two
        // arguments are required (a single argument would index args[-1]).
        if (args.length < 2) {
            printHelp();
            System.exit(1);
        }
        String solrHomePath = args[args.length - 2];
        String solrCoreName = args[args.length - 1];

        Map<String, String> params = getParams(args);
        String jatePropertyFile = getJATEProperties(params);
        String corpusDir = getCorpusDir(params);

        try {
            App appRake = new AppRAKE(params);
            // Indexing is only performed when a corpus directory was supplied.
            if (isCorpusProvided(corpusDir)) {
                appRake.index(Paths.get(corpusDir), Paths.get(solrHomePath), solrCoreName, jatePropertyFile);
            }

            List<JATETerm> terms = appRake.extract(solrHomePath, solrCoreName, jatePropertyFile);

            if (isExport(params)) {
                appRake.write(terms);
            }
            System.exit(0);
        } catch (IOException | JATEException e) {
            // Report the failure and signal it to the calling process instead
            // of falling off the end of main with a success status.
            log.error("RAKE term extraction failed", e);
            System.exit(1);
        }
    }

    /**
     * initialise run-time parameters for current algorithm
     *
     * @param initParams
     *            run-time parameters (e.g.,min term total freq, cutoff scoring
     *            threshold) for current algorithm
     * @see uk.ac.shef.dcs.jate.app.AppParams
     * @throws JATEException
     *             if the superclass rejects the supplied parameters
     */
    public AppRAKE(Map<String, String> initParams) throws JATEException {
        super(initParams);
    }

    /**
     * Creates an instance without supplying any run-time parameters.
     */
    public AppRAKE() {
    }

    /**
     * Loads the JATE properties from the given file and runs RAKE extraction
     * on the given core.
     *
     * @param core
     *            Solr core holding the indexed candidate terms
     * @param jatePropertyFile
     *            location of the JATE property file
     * @return the extracted terms
     * @throws IOException
     *             declared by the overridden method
     * @throws JATEException
     *             if the properties cannot be loaded or extraction fails
     */
    @Override
    public List<JATETerm> extract(SolrCore core, String jatePropertyFile)
            throws IOException, JATEException {
        JATEProperties properties = getJateProperties(jatePropertyFile);
        return extract(core, properties);
    }

    /**
     * Builds the features RAKE needs from the core's index, scores the
     * candidate terms and returns the result of applying the configured
     * cut-off.
     *
     * <p>Two frequency features are built (one registered under
     * {@link RAKE#SUFFIX_TERM}, one under {@link RAKE#SUFFIX_WORD}) together
     * with a term component index over all terms of the term-level feature.
     *
     * @param core
     *            Solr core holding the indexed candidate terms
     * @param properties
     *            JATE properties used by the feature builders and to look up
     *            Solr field names
     * @return the terms remaining after scoring and cut-off
     * @throws JATEException
     *             if feature building or term scoring fails
     */
    public List<JATETerm> extract(SolrCore core, JATEProperties properties) throws JATEException {
        // NOTE(review): getSearcher() hands back a holder object and only the
        // wrapped searcher is kept and closed here. Confirm against the Solr
        // API whether the holder should be released instead.
        SolrIndexSearcher searcher = core.getSearcher().get();
        try {
            // Frequency feature registered for terms (builder flag 0).
            this.freqFeatureBuilder = new FrequencyTermBasedFBMaster(searcher, properties, 0);
            this.freqFeature = (FrequencyTermBased) freqFeatureBuilder.build();

            // Frequency feature registered for words (builder flag 1).
            FrequencyTermBasedFBMaster fwbb = new FrequencyTermBasedFBMaster(searcher, properties, 1);
            FrequencyTermBased fwb = (FrequencyTermBased) fwbb.build();

            // The component index is built over ALL terms, before any
            // frequency filtering, so it gets its own copy of the key set.
            TermComponentIndexFBMaster tcib = new TermComponentIndexFBMaster(properties,
                    new ArrayList<>(this.freqFeature.getMapTerm2TTF().keySet()));
            TermComponentIndex termComponentIndex = (TermComponentIndex) tcib.build();

            RAKE rake = new RAKE();
            rake.registerFeature(FrequencyTermBased.class.getName() + RAKE.SUFFIX_TERM, this.freqFeature);
            rake.registerFeature(FrequencyTermBased.class.getName() + RAKE.SUFFIX_WORD, fwb);
            rake.registerFeature(TermComponentIndex.class.getName(), termComponentIndex);

            // Separate copy of the key set: this one is passed through the
            // frequency filter before scoring.
            List<String> candidates = new ArrayList<>(this.freqFeature.getMapTerm2TTF().keySet());
            filterByTTF(candidates);

            List<JATETerm> terms = rake.execute(candidates);
            terms = cutoff(terms);

            addAdditionalTermInfo(terms, searcher, properties.getSolrFieldNameJATENGramInfo(),
                    properties.getSolrFieldNameID());
            return terms;
        } finally {
            try {
                searcher.close();
            } catch (IOException e) {
                // Best effort: a failure to close must not mask the result or
                // an exception from the extraction itself.
                log.error("Failed to close Solr index searcher", e);
            }
        }
    }

    /**
     * Prints the command-line usage of this application to standard output.
     */
    protected static void printHelp() {
        StringBuilder sb = new StringBuilder("RAKE, usage:\n");
        // The positional arguments mirror what main() reads: Solr home path
        // followed by Solr core name.
        sb.append("java -cp '[CLASSPATH]' ").append(AppRAKE.class.getName()).append(" [OPTIONS] ")
                .append("[SOLR_HOME_PATH] [SOLR_CORE_NAME]").append("\nE.g.:\n");
        sb.append("java -cp '/libs/*' ").append(AppRAKE.class.getName())
                .append(" -t 20 /solr/server/solr jate\n\n");
        sb.append("[OPTIONS]:\n")
                .append("\t\t-c\t\t'true' or 'false'. Whether to collect term information, e.g., offsets in documents. Default is false.\n")
                .append("\t\t-t\t\tA number. Score threshold for selecting terms. If not set then default -n is used.")
                .append("\n")
                .append("\t\t-n\t\tA number. If an integer is given, top N candidates are selected as terms. \n")
                .append("\t\t\t\tIf a decimal number is given, top N% of candidates are selected. Default is 0.25.\n");
        sb.append("\t\t-o\t\tA file path. If provided, the output is written to the file. \n")
                .append("\t\t\t\tOtherwise, output is written to the console.\n")
                .append("\t\t-mttf\t\tA number. Min total fequency of a term for it to be considered for co-occurrence computation. \n")
                .append("\t\t-mtcf\t\tA number. Min frequency of a term appearing in different context for it to be considered for co-occurrence computation. \n");
        System.out.println(sb);
    }
}