package uk.ac.shef.dcs.jate.app;

import com.google.gson.Gson;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.lucene.index.LeafReader;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SolrIndexSearcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.algorithm.TermInfoCollector;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBased;
import uk.ac.shef.dcs.jate.feature.FrequencyTermBasedFBMaster;
import uk.ac.shef.dcs.jate.model.JATEDocument;
import uk.ac.shef.dcs.jate.model.JATETerm;
import uk.ac.shef.dcs.jate.util.IOUtil;
import uk.ac.shef.dcs.jate.util.JATEUtil;

import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;

/**
 * Base class for JATE term-extraction applications.
 * <p>
 * Holds the run-time parameters shared by all ranking algorithms (cutoff
 * threshold / top-K / top-K%, pre-filter frequencies, output file, reference
 * corpus frequency file), parses them from command-line style key/value maps,
 * and provides the common plumbing for corpus indexing, candidate extraction,
 * filtering and JSON export. Concrete algorithms implement
 * {@link #extract(SolrCore, String)}.
 */
public abstract class App {

    private final Logger log = LoggerFactory.getLogger(getClass());

    /**
     * corresponding to "-c" in command line
     * <p>
     * 'true' or 'false'. Whether to collect offsets of term occurrences in the corpus
     * and save in the output. Default is false.
     */
    protected Boolean collectTermInfo = false;

    /*
     * Three cutoff options to separate real terms from non-terms. All values are inclusive.
     */

    /** a cut off score */
    protected Double cutoffThreshold = null;
    /** select highest ranked K */
    protected Integer cutoffTopK = null;
    /** select highest ranked K% */
    protected Double cutoffTopKPercent = null;

    /**
     * corresponding to "-o" in command line
     * <p>
     * file path. If provided, the output list of terms is written to the file. Otherwise,
     * output is written to the console.
     */
    protected String outputFile = null;

    // Min total frequency of a term
    protected Integer prefilterMinTTF = 0;
    // Min frequency of a term appearing in different contexts
    protected Integer prefilterMinTCF = 0;
    // used by algorithms such as weirdness, glossex, termex that compare against a reference corpus
    protected String referenceFrequencyFilePath = null;

    protected FrequencyTermBasedFBMaster freqFeatureBuilder = null;
    // term indexed feature (typically frequency info.)
    // see also {@code AppATTF}
    protected FrequencyTermBased freqFeature = null;

    // FIX: made final (constant)
    private static final String DEFAULT_OUTPUT_FILE = "terms.txt";

    public App() {
    }

    /** @return true if an output file parameter ("-o") was supplied */
    protected static boolean isExport(Map<String, String> params) {
        return params.containsKey(AppParams.OUTPUT_FILE.getParamKey());
    }

    /**
     * if corpus provided, perform indexing first and then ranking &amp; filtering
     *
     * @param corpusDir corpus directory path (may be null or empty)
     * @return true if corpus is provided otherwise false
     */
    protected static boolean isCorpusProvided(String corpusDir) {
        return corpusDir != null && !corpusDir.isEmpty();
    }

    /**
     * Parses an integer run-time parameter.
     *
     * @param name  human-readable parameter name used in the error message
     * @param value raw string value to parse
     * @throws JATEException if the value is not a valid integer
     */
    private int parseIntParam(String name, String value) throws JATEException {
        try {
            return Integer.parseInt(value);
        } catch (NumberFormatException nfe) {
            String msg = String.format("%s is not set correctly. An integer value is expected. " +
                    "Actual input is %s", name, value);
            log.error(msg);
            throw new JATEException(msg);
        }
    }

    /**
     * Parses a decimal run-time parameter.
     *
     * @param name  human-readable parameter name used in the error message
     * @param value raw string value to parse
     * @throws JATEException if the value is not a valid decimal number
     */
    private double parseDoubleParam(String name, String value) throws JATEException {
        try {
            return Double.parseDouble(value);
        } catch (NumberFormatException nfe) {
            // FIX: message previously said "An integer value is expected" for a double parse
            String msg = String.format("%s is not set correctly. A decimal value is expected. " +
                    "Actual input is %s", name, value);
            log.error(msg);
            throw new JATEException(msg);
        }
    }

    /**
     * Initialise common run-time parameters
     *
     * @param params command line run-time parameters (paramKey, value) for term
     *               ranking algorithms
     * @throws JATEException if a numeric parameter cannot be parsed
     * @see AppParams
     */
    App(Map<String, String> params) throws JATEException {
        if (params.containsKey(AppParams.CUTOFF_TOP_K.getParamKey())) {
            String topKSetting = params.get(AppParams.CUTOFF_TOP_K.getParamKey());
            this.cutoffTopK = parseIntParam("Cutoff parameter Top K " + AppParams.CUTOFF_TOP_K.getParamKey(),
                    topKSetting);
            log.debug(String.format("Cutoff parameter: top [%s] term candidates will be selected as final terms",
                    topKSetting));
        }
        if (params.containsKey(AppParams.CUTOFF_TOP_K_PERCENT.getParamKey())) {
            String topPercSetting = params.get(AppParams.CUTOFF_TOP_K_PERCENT.getParamKey());
            this.cutoffTopKPercent = parseDoubleParam("Cutoff parameter Top K% " +
                    AppParams.CUTOFF_TOP_K_PERCENT.getParamKey(), topPercSetting);
            log.debug(String.format("Cutoff parameter: top [%s] percent of term candidates will be selected as final terms",
                    topPercSetting));
        }
        if (params.containsKey(AppParams.CUTOFF_THRESHOLD.getParamKey())) {
            String cutOffThreshold = params.get(AppParams.CUTOFF_THRESHOLD.getParamKey());
            this.cutoffThreshold = parseDoubleParam("Cutoff parameter term score " +
                    AppParams.CUTOFF_THRESHOLD.getParamKey(), cutOffThreshold);
            log.debug(String.format("Cutoff parameter: terms with a minimum score of [%s] will be selected as final terms",
                    cutOffThreshold));
        }
        if (params.containsKey(AppParams.PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey())) {
            String minTCF = params.get(AppParams.PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey());
            // FIX: use getParamKey() consistently (previously concatenated the enum constant itself)
            this.prefilterMinTCF = parseIntParam("Pre-filter minimum term context frequency " +
                    AppParams.PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey(), minTCF);
            log.debug(String.format("Pre-filter minimum term context frequency (used by co-occurrence based methods) is set to [%s]",
                    prefilterMinTCF));
        }
        if (params.containsKey(AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey())) {
            String minTTF = params.get(AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey());
            this.prefilterMinTTF = parseIntParam("Pre-filter minimum total term frequency " +
                    AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey(), minTTF);
            // FIX: previously logged prefilterMinTCF here
            log.debug(String.format("Pre-filter minimum total term frequency is set to [%s]", prefilterMinTTF));
        }
        if (params.containsKey(AppParams.COLLECT_TERM_INFO.getParamKey())) {
            String collectTermOffsets = params.get(AppParams.COLLECT_TERM_INFO.getParamKey());
            if (collectTermOffsets != null && collectTermOffsets.equalsIgnoreCase("true")) {
                this.collectTermInfo = true;
                log.debug("Term offsets will be collected and written to the output");
            }
        }
        if (params.containsKey(AppParams.OUTPUT_FILE.getParamKey())) {
            String outFile = params.get(AppParams.OUTPUT_FILE.getParamKey());
            String msg = "Output file is missing or its path is invalid (you can ignore this if you are running " +
                    "in the Plugin mode and do not require the list of terms to be exported to a file.) \n" +
                    "Output will be written to a default file 'terms.txt' instead.";
            if (outFile == null) {
                log.warn(msg);
                outputFile = DEFAULT_OUTPUT_FILE;
            } else {
                // writability probe — NOTE(review): this creates (or truncates) the target file as a side effect
                try {
                    PrintWriter p = new PrintWriter(outFile);
                    p.close();
                    outputFile = outFile;
                } catch (IOException ioe) {
                    log.warn(msg);
                    outputFile = DEFAULT_OUTPUT_FILE;
                }
            }
        }
    }

    /**
     * Initialise the reference corpus frequency file path ("-r") from the params map.
     *
     * @param initParams map param accepting reference frequency file
     * @throws JATEException if the path is unset or the file does not exist
     * @see AppParams#REFERENCE_FREQUENCY_FILE
     */
    protected void initalizeRefFreqParam(Map<String, String> initParams) throws JATEException {
        if (initParams.containsKey(AppParams.REFERENCE_FREQUENCY_FILE.getParamKey())) {
            String refFreqFilePath = initParams.get(AppParams.REFERENCE_FREQUENCY_FILE.getParamKey());
            if (refFreqFilePath == null) {
                String msg = String.format("Reference corpus frequency file %s is not set. A file path is expected.",
                        AppParams.REFERENCE_FREQUENCY_FILE.getParamKey());
                log.error(msg);
                throw new JATEException(msg);
            }
            File refFreqFile = new File(refFreqFilePath);
            if (!refFreqFile.exists()) {
                // FIX: "Excepted" -> "Expected" in the error message
                String msg = String.format("Expected reference corpus frequency file %s does not exist in %s.",
                        AppParams.REFERENCE_FREQUENCY_FILE.getParamKey(), refFreqFilePath);
                log.error(msg);
                throw new JATEException(msg);
            }
            this.referenceFrequencyFilePath = refFreqFilePath;
        } else {
            String msg = String.format("Reference corpus frequency file (-r) %s is not set. A file path is expected.",
                    AppParams.REFERENCE_FREQUENCY_FILE.getParamKey());
            log.error(msg);
            throw new JATEException(msg);
        }
    }

    /**
     * Rank and Filter terms candidates based on a given Solr index
     * <p>
     * This method assumes that documents are indexed in the solr container (solrHomePath)
     * and term candidates have already been extracted at index-time.
     * <p>
     * jate properties provides necessary information needed by the ATE algorithm (e.g., text field,
     * ngram info field, term candidate field, cut-off threshold)
     *
     * @param core             solr core
     * @param jatePropertyFile property file path, use the default one from classpath if not provided
     * @return List&lt;JATETerm&gt; the list of terms extracted
     * @throws IOException   on index I/O failure
     * @throws JATEException on configuration or extraction failure
     */
    public abstract List<JATETerm> extract(SolrCore core, String jatePropertyFile)
            throws IOException, JATEException;

    /**
     * Rank and Filter terms candidates based on a given Solr index
     * <p>
     * This method assumes that documents are indexed in the solr container (solrHomePath)
     * and term candidates have already been extracted at index-time.
     *
     * @param solrHomePath     solr core home directory path
     * @param coreName         solr core name from where term recognition is executed
     * @param jatePropertyFile jate property file path
     * @return List&lt;JATETerm&gt; the list of terms extracted
     * @throws IOException   on index I/O failure
     * @throws JATEException on configuration or extraction failure
     */
    public List<JATETerm> extract(String solrHomePath, String coreName, String jatePropertyFile)
            throws IOException, JATEException {
        EmbeddedSolrServer solrServer = null;
        SolrCore core = null;
        List<JATETerm> result = new ArrayList<>();
        try {
            solrServer = new EmbeddedSolrServer(Paths.get(solrHomePath), coreName);
            core = solrServer.getCoreContainer().getCore(coreName);
            result = extract(core, jatePropertyFile);
            // drop "terms" containing no alphanumeric characters at all (pure punctuation/whitespace)
            result.removeIf(jt -> jt.getString().replaceAll("[^a-zA-Z0-9]", "").length() == 0);
            return result;
        } finally {
            if (solrServer != null) {
                if (core != null) {
                    core.close();
                }
                solrServer.close();
                // workaround to avoid ERROR "CachingDirectoryFactory:150" — remove stale lock files
                solrServer.getCoreContainer().getAllCoreNames().forEach(currentCoreName -> {
                    File lock = Paths.get(solrHomePath, currentCoreName, "data", "index", "write.lock").toFile();
                    if (lock.exists()) {
                        lock.delete();
                    }
                });
            }
        }
    }

    /**
     * Corpus indexing and candidate extraction
     *
     * @param corpusDir        corpus directory to be indexed, from where term candidate will be extracted
     * @param solrHomePath     solr home path is the solr core container
     * @param coreName         solr core name
     * @param jatePropertyFile JATE properties file
     * @throws JATEException if loading properties or indexing fails
     */
    public void index(Path corpusDir, Path solrHomePath, String coreName, String jatePropertyFile)
            throws JATEException {
        log.info(String.format("Indexing corpus from [%s] and perform candidate extraction ...", corpusDir));
        List<Path> files = JATEUtil.loadFiles(corpusDir);
        log.info(" [" + files.size() + "] files are scanned and will be indexed and analysed.");
        final EmbeddedSolrServer solrServer = new EmbeddedSolrServer(solrHomePath, coreName);
        JATEProperties jateProp = getJateProperties(jatePropertyFile);
        try {
            files.forEach(file -> {
                try {
                    indexJATEDocuments(file, solrServer, jateProp, false);
                } catch (JATEException e) {
                    // FIX: log (consistent with the rest of this class) instead of printStackTrace()
                    log.error(ExceptionUtils.getFullStackTrace(e));
                }
            });
            solrServer.commit();
            log.info("all corpus are indexed with term candidates.");
        } catch (SolrServerException | IOException e) {
            throw new JATEException(String.format("Failed to index current corpus. Error:[%s]", e.toString()));
        } finally {
            try {
                solrServer.close();
            } catch (Exception e) {
                log.error("Unable to close solr index, error cause:");
                log.error(ExceptionUtils.getFullStackTrace(e));
            }
        }
    }

    /**
     * Load a single file as a {@link JATEDocument} and add it to the Solr index.
     * Null paths and empty documents are silently skipped.
     *
     * @param commit whether to commit the Solr update immediately
     * @throws JATEException wrapping any I/O or Solr failure
     */
    protected void indexJATEDocuments(Path file, EmbeddedSolrServer solrServer, JATEProperties jateProp,
                                      boolean commit) throws JATEException {
        if (file == null) {
            return;
        }
        try {
            JATEDocument jateDocument = JATEUtil.loadJATEDocument(file);
            if (isNotEmpty(jateDocument))
                JATEUtil.addNewDoc(solrServer, jateDocument.getId(), jateDocument.getId(),
                        jateDocument.getContent(), jateProp, commit);
        } catch (FileNotFoundException ffe) {
            throw new JATEException(ffe.toString());
        } catch (IOException ioe) {
            throw new JATEException(String.format("failed to index [%s]", file.toString()) + ioe.toString());
        } catch (SolrServerException sse) {
            throw new JATEException(String.format("failed to index [%s] ", file.toString()) + sse.toString());
        }
    }

    /** @return true if the document exists and its content is non-null and not all whitespace */
    private static boolean isNotEmpty(JATEDocument jateDocument) {
        return jateDocument != null && jateDocument.getContent() != null
                && jateDocument.getContent().trim().length() != 0;
    }

    /**
     * Only effective under the Embedded mode.
     * <p>
     * User can choose to output term offset information. If this is the case, this method will be
     * called upon every final term. Iterating through the solr index can be slow so this method can
     * take some time.
     *
     * @param terms              term list
     * @param leafReader         index reader
     * @param ngramInfoFieldname indexed n-gram field, see 'jate_text_2_ngrams' field in example schema
     * @param idFieldname        doc unique id field
     * @throws IOException on index I/O failure
     */
    public void collectTermOffsets(List<JATETerm> terms, LeafReader leafReader, String ngramInfoFieldname,
                                   String idFieldname) throws IOException {
        TermInfoCollector infoCollector = new TermInfoCollector(leafReader, ngramInfoFieldname, idFieldname);
        log.info("Gathering term information (e.g., provenance and offsets). This may take a while. Total="
                + terms.size());
        int count = 0;
        for (JATETerm jt : terms) {
            jt.setTermInfo(infoCollector.collect(jt.getString()));
            count++;
            if (count % 500 == 0)
                log.info("done " + count);
        }
    }

    /**
     * Add additional (indexed) term info into term list
     *
     * @param terms              filtered term candidates
     * @param searcher           solr index searcher
     * @param content2NgramField solr content to ngram TR aware field
     * @param idField            solr unique id
     * @throws JATEException wrapping any index I/O failure
     */
    public void addAdditionalTermInfo(List<JATETerm> terms, SolrIndexSearcher searcher,
                                      String content2NgramField, String idField) throws JATEException {
        if (this.collectTermInfo) {
            try {
                collectTermOffsets(terms, searcher.getLeafReader(), content2NgramField, idField);
            } catch (IOException e) {
                throw new JATEException("I/O exception when reading Solr index. " + e.toString());
            }
        }
    }

    /**
     * Term candidate filtering by total (whole index/corpus) term frequency
     * (exclusive): candidates with TTF below {@link #prefilterMinTTF} are removed in place.
     *
     * @param candidates term candidates
     * @throws JATEException if the frequency feature has not been initialised
     */
    protected void filterByTTF(List<String> candidates) throws JATEException {
        if (this.freqFeature == null) {
            throw new JATEException("FrequencyTermBased is not initialised for TTF term filtering.");
        }
        if (candidates == null || candidates.size() == 0) {
            return;
        }
        if (this.prefilterMinTTF != null) {
            log.debug(String.format("Filter [%s] term candidates by total term frequency [%s] (exclusive)",
                    candidates.size(), this.prefilterMinTTF));
            candidates.removeIf(t -> this.freqFeature.getTTF(t) < prefilterMinTTF);
            log.debug(String.format("filtered term candidate size: [%s]", candidates.size()));
        }
    }

    /**
     * Parse command-line arguments into a (paramKey, value) map.
     * Assumes the LAST TWO arguments are the solr home path and core name and
     * skips them; every remaining argument is consumed as a key/value pair.
     *
     * @param args raw command-line arguments
     * @return parameter map; empty if fewer than 3 arguments were given
     */
    protected static Map<String, String> getParams(String[] args) {
        Map<String, String> params = new HashMap<>();
        if (args.length < 3) {
            return params;
        }
        for (int i = 0; i < args.length; i++) {
            // skip the trailing positional arguments (solr home, core name)
            if (i == args.length - 2 || i == args.length - 1) {
                continue;
            }
            if (i + 1 < args.length) {
                String param = args[i];
                String value = args[i + 1];
                i++;
                params.put(param, value);
            }
        }
        return params;
    }

    /**
     * Export the term list as JSON to {@link #outputFile}.
     *
     * @param terms final (filtered) terms to export
     * @throws IOException if no output file is set or writing fails
     */
    public void write(List<JATETerm> terms) throws IOException {
        Gson gson = new Gson();
        if (outputFile == null) {
            throw new IOException("Output file is null");
        }
        log.info(String.format("Exporting terms to [%s]", outputFile));
        // FIX: try-with-resources — the writer previously leaked if gson.toJson threw
        try (Writer w = IOUtil.getUTF8Writer(outputFile)) {
            gson.toJson(terms, w);
        }
        log.info("complete.");
    }

    /**
     * filter term candidates by cut-off threshold, top K or K% where applicable.
     * Preference order when several are set: threshold, top K, top K%.
     *
     * @param terms candidate terms to be filtered
     * @return List&lt;JATETerm&gt;, filtered terms
     */
    protected List<JATETerm> cutoff(List<JATETerm> terms) {
        if (this.cutoffThreshold != null) {
            return cutoffByTermScoreThreshold(terms, this.cutoffThreshold);
        } else if (this.cutoffTopK != null) {
            return cutoffByTopK(terms, this.cutoffTopK);
        } else if (this.cutoffTopKPercent != null) {
            return cutoffByTopKPercent(terms, this.cutoffTopKPercent);
        }
        return terms;
    }

    /**
     * Filter term candidate list by termhood/unithood based threshold
     * (inclusive). The input list is not modified; a filtered copy is returned.
     *
     * @param terms           a list of term candidates with term weight
     * @param cutOffThreshold term score measured by ATR algorithms
     * @return List&lt;JATETerm&gt; filtered terms
     */
    protected List<JATETerm> cutoffByTermScoreThreshold(List<JATETerm> terms, Double cutOffThreshold) {
        List<JATETerm> weightedTerms = new ArrayList<>(terms);
        // FIX: '&' -> '&&' (short-circuit)
        if (cutOffThreshold != null && weightedTerms.size() > 0) {
            log.debug(String.format("cutoff [%s] term candidates by termhood/unithood based threshold [%s]",
                    weightedTerms.size(), cutOffThreshold));
            weightedTerms.removeIf(t -> t.getScore() < cutOffThreshold);
            // FIX: log the filtered size (previously logged the unfiltered input size)
            log.debug(String.format("final filtered term candidate size [%s]", weightedTerms.size()));
        }
        return weightedTerms;
    }

    /**
     * Filter term candidate list by top N (inclusive) terms
     *
     * @param terms terms ranked by term weight
     * @param topK  top N term number
     * @return List&lt;JATETerm&gt; filtered terms
     */
    protected List<JATETerm> cutoffByTopK(List<JATETerm> terms, Integer topK) {
        // FIX: '&' -> '&&' — the non-short-circuit form evaluated terms.size() even when terms was null (NPE)
        if (topK != null && terms != null && terms.size() > 0 && topK < terms.size()) {
            log.debug(String.format("cutoff [%s] term candidates by Top [%s] ...", terms.size(), topK));
            // FIX: subList's upper bound is exclusive, so (0, topK) keeps exactly topK terms;
            // the previous (0, topK + 1) returned topK + 1 terms (off-by-one)
            terms = terms.subList(0, topK);
            log.debug(String.format("final filtered term list size is [%s]", terms.size()));
        }
        return terms;
    }

    /**
     * Filter term candidate list by rounding top percentage of total term size
     *
     * @param terms         weighted term list
     * @param topPercentage top percentage of weighted terms to be retained
     * @return List&lt;JATETerm&gt; filtered top K percent terms
     */
    protected List<JATETerm> cutoffByTopKPercent(List<JATETerm> terms, Double topPercentage) {
        // FIX: '&' -> '&&' — terms.size() was evaluated even when terms was null (NPE)
        if (topPercentage != null && terms != null && terms.size() > 0) {
            log.debug(String.format("filter [%s] term candidates by Top [%s] percent (rounded) ...",
                    terms.size(), topPercentage * 100));
            int topN = (int) Math.round(topPercentage * terms.size());
            if (topN > 0)
                terms = cutoffByTopK(terms, topN);
            log.debug(String.format("final filtered term list size is [%s]", terms.size()));
        }
        return terms;
    }

    /** @return the jate properties file path from params, or null if not set */
    protected static String getJATEProperties(Map<String, String> params) {
        if (params.containsKey(AppParams.JATE_PROPERTIES_FILE.getParamKey())) {
            return params.get(AppParams.JATE_PROPERTIES_FILE.getParamKey());
        }
        return null;
    }

    /** @return the corpus directory from params, or null if not set */
    protected static String getCorpusDir(Map<String, String> params) {
        if (params.containsKey(AppParams.CORPUS_DIR.getParamKey())) {
            return params.get(AppParams.CORPUS_DIR.getParamKey());
        }
        return null;
    }

    /**
     * load JATE property file, if not provided (i.e., null or empty), the default one is loaded.
     *
     * @param jatePropertyFile jate property file path where the file will be loaded
     * @return JATEProperties object
     * @throws JATEException if the properties cannot be loaded
     */
    public static JATEProperties getJateProperties(String jatePropertyFile) throws JATEException {
        JATEProperties properties;
        if (jatePropertyFile != null && !jatePropertyFile.isEmpty()) {
            properties = new JATEProperties(jatePropertyFile);
        } else {
            properties = new JATEProperties();
        }
        return properties;
    }

    /** Print command-line usage to stdout. */
    protected static void printHelp() {
        StringBuilder sb = new StringBuilder("Usage:\n");
        sb.append("java -cp '[CLASSPATH]' ").append(App.class.getName()).append(" ")
                .append("[OPTIONS] [SOLR_HOME_PATH] [SOLR_CORE_NAME] ").append("\n\n");
        sb.append("Example: java -cp '/libs/*' /corpus/ /solr/server/solr jate -prop jate.properties -cf.k 20 ...\n\n");
        sb.append("[OPTIONS]:\n")
                // FIX: added the '\n' missing after the -corpusDir, -prop and -cf.t entries
                .append("\t\t-corpusDir\t\t. The corpus to be indexed, from where term candidate will be extracted, ranked and weighted.\n")
                .append("\t\t-prop\t\t. jate.properties file for the configuration of Solr schema.\n")
                .append("\t\t-c\t\t'true' or 'false'. Whether to collect term information for exporting, e.g., offsets in documents. Default is false.\n")
                .append("\t\t-r\t\t. Reference corpus frequency file path (-r) is required by AppGlossEx, AppTermEx and AppWeirdness.\n")
                .append("\t\t-cf.t\t\tA number. Cutoff score threshold for selecting terms. If multiple -cf.* parameters are set the preference order will be cf.t, cf.k, cf.kp.\n")
                .append("\t\t-cf.k\t\tA number. Cutoff top ranked K terms to be selected. If multiple -cf.* parameters are set the preference order will be cf.t, cf.k, cf.kp.\n")
                .append("\t\t-cf.kp\t\tA number. Cutoff top ranked K% terms to be selected. If multiple -cf.* parameters are set the preference order will be cf.t, cf.k, cf.kp.\n")
                .append("\t\t-pf.mttf\t\tA number. Pre-filter minimum total term frequency. \n")
                .append("\t\t-pf.mtcf\t\tA number. Pre-filter minimum context frequency of a term (used by co-occurrence based methods). \n")
                .append("\t\t-o\t\tA file path to save output. \n");
        System.out.println(sb);
    }
}