package uk.ac.shef.dcs.jate.app;

import com.google.gson.Gson;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.lucene.index.LeafReader;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.algorithm.TermInfoCollector;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBased;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBasedFBMaster;
import uk.ac.shef.dcs.jate.model.JATEDocument;
import uk.ac.shef.dcs.jate.model.JATETerm;
import uk.ac.shef.dcs.jate.util.IOUtil;
import uk.ac.shef.dcs.jate.util.JATEUtil;

import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;

/**
 * Base class for JATE term-extraction applications.
 * <p>
 * Holds the run-time parameters shared by all ranking algorithms (cutoff
 * threshold / top-K / top-K%, pre-filter frequencies, output file, reference
 * corpus frequency file), parses them from command-line style key/value maps,
 * and provides the common plumbing for corpus indexing, candidate extraction,
 * filtering and JSON export. Concrete algorithms implement
 * {@link #extract(SolrCore, String)}.
 */
public abstract class App {

    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * corresponding to "-c" in command line
     * <p>
     * 'true' or 'false'. Whether to collect offsets of term occurrences in the corpus
     * and save in the output. Default is false.
     */
    protected Boolean collectTermInfo = false;

    /*
     * Three cutoff options to separate real terms from non-terms. All values are inclusive.
     */

    /** a cut off score */
    protected Double cutoffThreshold = null;
    /** select highest ranked K */
    protected Integer cutoffTopK = null;
    /** select highest ranked K% */
    protected Double cutoffTopKPercent = null;

    /**
     * corresponding to "-o" in command line
     * <p>
     * file path. If provided, the output list of terms is written to the file. Otherwise,
     * output is written to the console.
     */
    protected String outputFile = null;

    // Min total frequency of a term
    protected Integer prefilterMinTTF = 0;
    // Min frequency of a term appearing in different contexts
    protected Integer prefilterMinTCF = 0;
    // used by algorithms such as weirdness, glossex, termex that compare against a reference corpus
    protected String referenceFrequencyFilePath = null;

    protected FrequencyTermBasedFBMaster freqFeatureBuilder = null;
    // term indexed feature (typically frequency info.)
    // see also {@code AppATTF}
    protected FrequencyTermBased freqFeature = null;

    // FIX: made final (constant)
    private static final String DEFAULT_OUTPUT_FILE = "terms.txt";

    public App() {
    }

    /** @return true if an output file parameter ("-o") was supplied */
    protected static boolean isExport(Map<String, String> params) {
        return params.containsKey(AppParams.OUTPUT_FILE.getParamKey());
    }

    /**
     * if corpus provided, perform indexing first and then ranking &amp; filtering
     *
     * @param corpusDir corpus directory path (may be null or empty)
     * @return true if corpus is provided otherwise false
     */
    protected static boolean isCorpusProvided(String corpusDir) {
        return corpusDir != null && !corpusDir.isEmpty();
    }

    /**
     * Parses an integer run-time parameter.
     *
     * @param name  human-readable parameter name used in the error message
     * @param value raw string value to parse
     * @throws JATEException if the value is not a valid integer
     */
    private int parseIntParam(String name, String value) throws JATEException {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException nfe) {
            String msg = String.format("%s is not set correctly. An integer value is expected. " +
                    "Actual input is %s", name, value);
            log.error(msg);
            throw new JATEException(msg);
        }
    }

    /**
     * Parses a decimal run-time parameter.
     *
     * @param name  human-readable parameter name used in the error message
     * @param value raw string value to parse
     * @throws JATEException if the value is not a valid decimal number
     */
    private double parseDoubleParam(String name, String value) throws JATEException {
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException nfe) {
            // FIX: message previously said "An integer value is expected" for a double parse
            String msg = String.format("%s is not set correctly. A decimal value is expected. " +
                    "Actual input is %s", name, value);
            log.error(msg);
            throw new JATEException(msg);
        }
    }

    /**
     * Initialise common run-time parameters
     *
     * @param params command line run-time parameters (paramKey, value) for term
     *               ranking algorithms
     * @throws JATEException if a numeric parameter cannot be parsed
     * @see AppParams
     */
    App(Map<String, String> params) throws JATEException {
        if (params.containsKey(AppParams.CUTOFF_TOP_K.getParamKey())) {
            String topKSetting = params.get(AppParams.CUTOFF_TOP_K.getParamKey());
            this.cutoffTopK = parseIntParam("Cutoff parameter Top K " + AppParams.CUTOFF_TOP_K.getParamKey(),
                    topKSetting);
            log.debug(String.format("Cutoff parameter: top [%s] term candidates will be selected as final terms",
                    topKSetting));
        }
        if (params.containsKey(AppParams.CUTOFF_TOP_K_PERCENT.getParamKey())) {
            String topPercSetting = params.get(AppParams.CUTOFF_TOP_K_PERCENT.getParamKey());
            this.cutoffTopKPercent = parseDoubleParam("Cutoff parameter Top K% " +
                    AppParams.CUTOFF_TOP_K_PERCENT.getParamKey(), topPercSetting);
            log.debug(String.format("Cutoff parameter: top [%s] percent of term candidates will be selected as final terms",
                    topPercSetting));
        }
        if (params.containsKey(AppParams.CUTOFF_THRESHOLD.getParamKey())) {
            String cutOffThreshold = params.get(AppParams.CUTOFF_THRESHOLD.getParamKey());
            this.cutoffThreshold = parseDoubleParam("Cutoff parameter term score " +
                    AppParams.CUTOFF_THRESHOLD.getParamKey(), cutOffThreshold);
            log.debug(String.format("Cutoff parameter: terms with a minimum score of [%s] will be selected as final terms",
                    cutOffThreshold));
        }
        if (params.containsKey(AppParams.PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey())) {
            String minTCF = params.get(AppParams.PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey());
            // FIX: use getParamKey() consistently (previously concatenated the enum constant itself)
            this.prefilterMinTCF = parseIntParam("Pre-filter minimum term context frequency " +
                    AppParams.PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey(), minTCF);
            log.debug(String.format("Pre-filter minimum term context frequency (used by co-occurrence based methods) is set to [%s]",
                    prefilterMinTCF));
        }
        if (params.containsKey(AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey())) {
            String minTTF = params.get(AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey());
            this.prefilterMinTTF = parseIntParam("Pre-filter minimum total term frequency " +
                    AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey(), minTTF);
            // FIX: previously logged prefilterMinTCF here
            log.debug(String.format("Pre-filter minimum total term frequency is set to [%s]", prefilterMinTTF));
        }
        if (params.containsKey(AppParams.COLLECT_TERM_INFO.getParamKey())) {
            String collectTermOffsets = params.get(AppParams.COLLECT_TERM_INFO.getParamKey());
            if (collectTermOffsets != null && collectTermOffsets.equalsIgnoreCase("true")) {
                this.collectTermInfo = true;
                log.debug("Term offsets will be collected and written to the output");
            }
        }
        if (params.containsKey(AppParams.OUTPUT_FILE.getParamKey())) {
            String outFile = params.get(AppParams.OUTPUT_FILE.getParamKey());
            String msg = "Output file is missing or its path is invalid (you can ignore this if you are running " +
                    "in the Plugin mode and do not require the list of terms to be exported to a file.) \n" +
                    "Output will be written to a default file 'terms.txt' instead.";
            if (outFile == null) {
                log.warn(msg);
                outputFile = DEFAULT_OUTPUT_FILE;
            } else {
                // writability probe — NOTE(review): this creates (or truncates) the target file as a side effect
                try {
                    PrintWriter p = new PrintWriter(outFile);
                    p.close();
                    outputFile = outFile;
                } catch (IOException ioe) {
                    log.warn(msg);
                    outputFile = DEFAULT_OUTPUT_FILE;
                }
            }
        }
    }

    /**
     * Initialise the reference corpus frequency file path ("-r") from the params map.
     *
     * @param initParams map param accepting reference frequency file
     * @throws JATEException if the path is unset or the file does not exist
     * @see AppParams#REFERENCE_FREQUENCY_FILE
     */
    protected void initalizeRefFreqParam(Map<String, String> initParams) throws JATEException {
        if (initParams.containsKey(AppParams.REFERENCE_FREQUENCY_FILE.getParamKey())) {
            String refFreqFilePath = initParams.get(AppParams.REFERENCE_FREQUENCY_FILE.getParamKey());
            if (refFreqFilePath == null) {
                String msg = String.format("Reference corpus frequency file %s is not set. A file path is expected.",
                        AppParams.REFERENCE_FREQUENCY_FILE.getParamKey());
                log.error(msg);
                throw new JATEException(msg);
            }
            File refFreqFile = new File(refFreqFilePath);
            if (!refFreqFile.exists()) {
                // FIX: "Excepted" -> "Expected" in the error message
                String msg = String.format("Expected reference corpus frequency file %s does not exist in %s.",
                        AppParams.REFERENCE_FREQUENCY_FILE.getParamKey(), refFreqFilePath);
                log.error(msg);
                throw new JATEException(msg);
            }
            this.referenceFrequencyFilePath = refFreqFilePath;
        } else {
            String msg = String.format("Reference corpus frequency file (-r) %s is not set. A file path is expected.",
                    AppParams.REFERENCE_FREQUENCY_FILE.getParamKey());
            log.error(msg);
            throw new JATEException(msg);
        }
    }

    /**
     * Rank and Filter terms candidates based on a given Solr index
     * <p>
     * This method assumes that documents are indexed in the solr container (solrHomePath)
     * and term candidates have already been extracted at index-time.
     * <p>
     * jate properties provides necessary information needed by the ATE algorithm (e.g., text field,
     * ngram info field, term candidate field, cut-off threshold)
     *
     * @param core             solr core
     * @param jatePropertyFile property file path, use the default one from classpath if not provided
     * @return List&lt;JATETerm&gt; the list of terms extracted
     * @throws IOException   on index I/O failure
     * @throws JATEException on configuration or extraction failure
     */
    public abstract List<JATETerm> extract(SolrCore core, String jatePropertyFile)
            throws IOException, JATEException;

    /**
     * Rank and Filter terms candidates based on a given Solr index
     * <p>
     * This method assumes that documents are indexed in the solr container (solrHomePath)
     * and term candidates have already been extracted at index-time.
     *
     * @param solrHomePath     solr core home directory path
     * @param coreName         solr core name from where term recognition is executed
     * @param jatePropertyFile jate property file path
     * @return List&lt;JATETerm&gt; the list of terms extracted
     * @throws IOException   on index I/O failure
     * @throws JATEException on configuration or extraction failure
     */
    public List<JATETerm> extract(String solrHomePath, String coreName, String jatePropertyFile)
            throws IOException, JATEException {
        EmbeddedSolrServer solrServer = null;
        SolrCore core = null;
        List<JATETerm> result = new ArrayList<>();
        try {
            solrServer = new EmbeddedSolrServer(Paths.get(solrHomePath), coreName);
            core = solrServer.getCoreContainer().getCore(coreName);
            result = extract(core, jatePropertyFile);
            // drop "terms" containing no alphanumeric characters at all (pure punctuation/whitespace)
            result.removeIf(jt -> jt.getString().replaceAll("[^a-zA-Z0-9]", "").length() == 0);
            return result;
        } finally {
            if (solrServer != null) {
                if (core != null) {
                    core.close();
                }
                solrServer.close();
                // workaround to avoid ERROR "CachingDirectoryFactory:150" — remove stale lock files
                solrServer.getCoreContainer().getAllCoreNames().forEach(currentCoreName -> {
                    File lock = Paths.get(solrHomePath, currentCoreName, "data", "index", "write.lock").toFile();
                    if (lock.exists()) {
                        lock.delete();
                    }
                });
            }
        }
    }

    /**
     * Corpus indexing and candidate extraction
     *
     * @param corpusDir        corpus directory to be indexed, from where term candidate will be extracted
     * @param solrHomePath     solr home path is the solr core container
     * @param coreName         solr core name
     * @param jatePropertyFile JATE properties file
     * @throws JATEException if loading properties or indexing fails
     */
    public void index(Path corpusDir, Path solrHomePath, String coreName, String jatePropertyFile)
            throws JATEException {
        log.info(String.format("Indexing corpus from [%s] and perform candidate extraction ...", corpusDir));
        List<Path> files = JATEUtil.loadFiles(corpusDir);
        log.info(" [" + files.size() + "] files are scanned and will be indexed and analysed.");
        final EmbeddedSolrServer solrServer = new EmbeddedSolrServer(solrHomePath, coreName);
        JATEProperties jateProp = getJateProperties(jatePropertyFile);
        try {
            files.forEach(file -> {
                try {
                    indexJATEDocuments(file, solrServer, jateProp, false);
                } catch (JATEException e) {
                    // FIX: log (consistent with the rest of this class) instead of printStackTrace()
                    log.error(ExceptionUtils.getFullStackTrace(e));
                }
            });
            solrServer.commit();
            log.info("all corpus are indexed with term candidates.");
        } catch (SolrServerException | IOException e) {
            throw new JATEException(String.format("Failed to index current corpus. Error:[%s]", e.toString()));
        } finally {
            try {
                solrServer.close();
            } catch (Exception e) {
                log.error("Unable to close solr index, error cause:");
                log.error(ExceptionUtils.getFullStackTrace(e));
            }
        }
    }

    /**
     * Load a single file as a {@link JATEDocument} and add it to the Solr index.
     * Null paths and empty documents are silently skipped.
     *
     * @param commit whether to commit the Solr update immediately
     * @throws JATEException wrapping any I/O or Solr failure
     */
    protected void indexJATEDocuments(Path file, EmbeddedSolrServer solrServer, JATEProperties jateProp,
                                      boolean commit) throws JATEException {
        if (file == null) {
            return;
        }
        try {
            JATEDocument jateDocument = JATEUtil.loadJATEDocument(file);
            if (isNotEmpty(jateDocument))
                JATEUtil.addNewDoc(solrServer, jateDocument.getId(), jateDocument.getId(),
                        jateDocument.getContent(), jateProp, commit);
        } catch (FileNotFoundException ffe) {
            throw new JATEException(ffe.toString());
        } catch (IOException ioe) {
            throw new JATEException(String.format("failed to index [%s]", file.toString()) + ioe.toString());
        } catch (SolrServerException sse) {
            throw new JATEException(String.format("failed to index [%s] ", file.toString()) + sse.toString());
        }
    }

    /** @return true if the document exists and its content is non-null and not all whitespace */
    private static boolean isNotEmpty(JATEDocument jateDocument) {
        return jateDocument != null && jateDocument.getContent() != null
                && jateDocument.getContent().trim().length() != 0;
    }

    /**
     * Only effective under the Embedded mode.
     * <p>
     * User can choose to output term offset information. If this is the case, this method will be
     * called upon every final term. Iterating through the solr index can be slow so this method can
     * take some time.
     *
     * @param terms              term list
     * @param leafReader         index reader
     * @param ngramInfoFieldname indexed n-gram field, see 'jate_text_2_ngrams' field in example schema
     * @param idFieldname        doc unique id field
     * @throws IOException on index I/O failure
     */
    public void collectTermOffsets(List<JATETerm> terms, LeafReader leafReader, String ngramInfoFieldname,
                                   String idFieldname) throws IOException {
        TermInfoCollector infoCollector = new TermInfoCollector(leafReader, ngramInfoFieldname, idFieldname);
        log.info("Gathering term information (e.g., provenance and offsets). This may take a while. Total="
                + terms.size());
        int count = 0;
        for (JATETerm jt : terms) {
            jt.setTermInfo(infoCollector.collect(jt.getString()));
            count++;
            if (count % 500 == 0)
                log.info("done " + count);
        }
    }

    /**
     * Add additional (indexed) term info into term list
     *
     * @param terms              filtered term candidates
     * @param searcher           solr index searcher
     * @param content2NgramField solr content to ngram TR aware field
     * @param idField            solr unique id
     * @throws JATEException wrapping any index I/O failure
     */
    public void addAdditionalTermInfo(List<JATETerm> terms, SolrIndexSearcher searcher,
                                      String content2NgramField, String idField) throws JATEException {
        if (this.collectTermInfo) {
            try {
                collectTermOffsets(terms, searcher.getLeafReader(), content2NgramField, idField);
            } catch (IOException e) {
                throw new JATEException("I/O exception when reading Solr index. " + e.toString());
            }
        }
    }

    /**
     * Term candidate filtering by total (whole index/corpus) term frequency
     * (exclusive): candidates with TTF below {@link #prefilterMinTTF} are removed in place.
     *
     * @param candidates term candidates
     * @throws JATEException if the frequency feature has not been initialised
     */
    protected void filterByTTF(List<String> candidates) throws JATEException {
        if (this.freqFeature == null) {
            throw new JATEException("FrequencyTermBased is not initialised for TTF term filtering.");
        }
        if (candidates == null || candidates.size() == 0) {
            return;
        }
        if (this.prefilterMinTTF != null) {
            log.debug(String.format("Filter [%s] term candidates by total term frequency [%s] (exclusive)",
                    candidates.size(), this.prefilterMinTTF));
            candidates.removeIf(t -> this.freqFeature.getTTF(t) < prefilterMinTTF);
            log.debug(String.format("filtered term candidate size: [%s]", candidates.size()));
        }
    }

    /**
     * Parse command-line arguments into a (paramKey, value) map.
     * Assumes the LAST TWO arguments are the solr home path and core name and
     * skips them; every remaining argument is consumed as a key/value pair.
     *
     * @param args raw command-line arguments
     * @return parameter map; empty if fewer than 3 arguments were given
     */
    protected static Map<String, String> getParams(String[] args) {
        Map<String, String> params = new HashMap<>();
        if (args.length < 3) {
            return params;
        }
        for (int i = 0; i < args.length; i++) {
            // skip the trailing positional arguments (solr home, core name)
            if (i == args.length - 2 || i == args.length - 1) {
                continue;
            }
            if (i + 1 < args.length) {
                String param = args[i];
                String value = args[i + 1];
                i++;
                params.put(param, value);
            }
        }
        return params;
    }

    /**
     * Export the term list as JSON to {@link #outputFile}.
     *
     * @param terms final (filtered) terms to export
     * @throws IOException if no output file is set or writing fails
     */
    public void write(List<JATETerm> terms) throws IOException {
        Gson gson = new Gson();
        if (outputFile == null) {
            throw new IOException("Output file is null");
        }
        log.info(String.format("Exporting terms to [%s]", outputFile));
        // FIX: try-with-resources — the writer previously leaked if gson.toJson threw
        try (Writer w = IOUtil.getUTF8Writer(outputFile)) {
            gson.toJson(terms, w);
        }
        log.info("complete.");
    }

    /**
     * filter term candidates by cut-off threshold, top K or K% where applicable.
     * Preference order when several are set: threshold, top K, top K%.
     *
     * @param terms candidate terms to be filtered
     * @return List&lt;JATETerm&gt;, filtered terms
     */
    protected List<JATETerm> cutoff(List<JATETerm> terms) {
        if (this.cutoffThreshold != null) {
            return cutoffByTermScoreThreshold(terms, this.cutoffThreshold);
        } else if (this.cutoffTopK != null) {
            return cutoffByTopK(terms, this.cutoffTopK);
        } else if (this.cutoffTopKPercent != null) {
            return cutoffByTopKPercent(terms, this.cutoffTopKPercent);
        }
        return terms;
    }

    /**
     * Filter term candidate list by termhood/unithood based threshold
     * (inclusive). The input list is not modified; a filtered copy is returned.
     *
     * @param terms           a list of term candidates with term weight
     * @param cutOffThreshold term score measured by ATR algorithms
     * @return List&lt;JATETerm&gt; filtered terms
     */
    protected List<JATETerm> cutoffByTermScoreThreshold(List<JATETerm> terms, Double cutOffThreshold) {
        List<JATETerm> weightedTerms = new ArrayList<>(terms);
        // FIX: '&' -> '&&' (short-circuit)
        if (cutOffThreshold != null && weightedTerms.size() > 0) {
            log.debug(String.format("cutoff [%s] term candidates by termhood/unithood based threshold [%s]",
                    weightedTerms.size(), cutOffThreshold));
            weightedTerms.removeIf(t -> t.getScore() < cutOffThreshold);
            // FIX: log the filtered size (previously logged the unfiltered input size)
            log.debug(String.format("final filtered term candidate size [%s]", weightedTerms.size()));
        }
        return weightedTerms;
    }

    /**
     * Filter term candidate list by top N (inclusive) terms
     *
     * @param terms terms ranked by term weight
     * @param topK  top N term number
     * @return List&lt;JATETerm&gt; filtered terms
     */
    protected List<JATETerm> cutoffByTopK(List<JATETerm> terms, Integer topK) {
        // FIX: '&' -> '&&' — the non-short-circuit form evaluated terms.size() even when terms was null (NPE)
        if (topK != null && terms != null && terms.size() > 0 && topK < terms.size()) {
            log.debug(String.format("cutoff [%s] term candidates by Top [%s] ...", terms.size(), topK));
            // FIX: subList's upper bound is exclusive, so (0, topK) keeps exactly topK terms;
            // the previous (0, topK + 1) returned topK + 1 terms (off-by-one)
            terms = terms.subList(0, topK);
            log.debug(String.format("final filtered term list size is [%s]", terms.size()));
        }
        return terms;
    }

    /**
     * Filter term candidate list by rounding top percentage of total term size
     *
     * @param terms         weighted term list
     * @param topPercentage top percentage of weighted terms to be retained
     * @return List&lt;JATETerm&gt; filtered top K percent terms
     */
    protected List<JATETerm> cutoffByTopKPercent(List<JATETerm> terms, Double topPercentage) {
        // FIX: '&' -> '&&' — terms.size() was evaluated even when terms was null (NPE)
        if (topPercentage != null && terms != null && terms.size() > 0) {
            log.debug(String.format("filter [%s] term candidates by Top [%s] percent (rounded) ...",
                    terms.size(), topPercentage * 100));
            int topN = (int) Math.round(topPercentage * terms.size());
            if (topN > 0)
                terms = cutoffByTopK(terms, topN);
            log.debug(String.format("final filtered term list size is [%s]", terms.size()));
        }
        return terms;
    }

    /** @return the jate properties file path from params, or null if not set */
    protected static String getJATEProperties(Map<String, String> params) {
        if (params.containsKey(AppParams.JATE_PROPERTIES_FILE.getParamKey())) {
            return params.get(AppParams.JATE_PROPERTIES_FILE.getParamKey());
        }
        return null;
    }

    /** @return the corpus directory from params, or null if not set */
    protected static String getCorpusDir(Map<String, String> params) {
        if (params.containsKey(AppParams.CORPUS_DIR.getParamKey())) {
            return params.get(AppParams.CORPUS_DIR.getParamKey());
        }
        return null;
    }

    /**
     * load JATE property file, if not provided (i.e., null or empty), the default one is loaded.
     *
     * @param jatePropertyFile jate property file path where the file will be loaded
     * @return JATEProperties object
     * @throws JATEException if the properties cannot be loaded
     */
    public static JATEProperties getJateProperties(String jatePropertyFile) throws JATEException {
        JATEProperties properties;
        if (jatePropertyFile != null && !jatePropertyFile.isEmpty()) {
            properties = new JATEProperties(jatePropertyFile);
        } else {
            properties = new JATEProperties();
        }
        return properties;
    }

    /** Print command-line usage to stdout. */
    protected static void printHelp() {
        StringBuilder sb = new StringBuilder("Usage:\n");
        sb.append("java -cp '[CLASSPATH]' ").append(App.class.getName()).append(" ")
                .append("[OPTIONS] [SOLR_HOME_PATH] [SOLR_CORE_NAME] ").append("\n\n");
        sb.append("Example: java -cp '/libs/*' /corpus/ /solr/server/solr jate -prop jate.properties -cf.k 20 ...\n\n");
        sb.append("[OPTIONS]:\n")
                // FIX: added the '\n' missing after the -corpusDir, -prop and -cf.t entries
                .append("\t\t-corpusDir\t\t. The corpus to be indexed, from where term candidate will be extracted, ranked and weighted.\n")
                .append("\t\t-prop\t\t. jate.properties file for the configuration of Solr schema.\n")
                .append("\t\t-c\t\t'true' or 'false'. Whether to collect term information for exporting, e.g., offsets in documents. Default is false.\n")
                .append("\t\t-r\t\t. Reference corpus frequency file path (-r) is required by AppGlossEx, AppTermEx and AppWeirdness.\n")
                .append("\t\t-cf.t\t\tA number. Cutoff score threshold for selecting terms. If multiple -cf.* parameters are set the preference order will be cf.t, cf.k, cf.kp.\n")
                .append("\t\t-cf.k\t\tA number. Cutoff top ranked K terms to be selected. If multiple -cf.* parameters are set the preference order will be cf.t, cf.k, cf.kp.\n")
                .append("\t\t-cf.kp\t\tA number. Cutoff top ranked K% terms to be selected. If multiple -cf.* parameters are set the preference order will be cf.t, cf.k, cf.kp.\n")
                .append("\t\t-pf.mttf\t\tA number. Pre-filter minimum total term frequency. \n")
                .append("\t\t-pf.mtcf\t\tA number. Pre-filter minimum context frequency of a term (used by co-occurrence based methods). \n")
                .append("\t\t-o\t\tA file path to save output. \n");
        System.out.println(sb);
    }
}