ACLRDTECTest.java example

Explorer

jat-master
- jate-master
  - src
    - main
      - java
        org
        apache
        lucene
        analysis
        jate
        ComplexShingleFilter.java
        ComplexShingleFilterFactory.java
        EnglishLemmatisationFilter.java
        EnglishLemmatisationFilterFactory.java
        MWEFeatureFilter.java
        MWEFeatureFilterFactory.java
        MWEFilter.java
        MWEFilterFactory.java
        MWEMetadata.java
        MWEMetadataType.java
        OpenNLPMWEFilter.java
        OpenNLPNounPhraseFilter.java
        OpenNLPNounPhraseFilterFactory.java
        OpenNLPPOSTaggerFactory.java
        OpenNLPPOSTaggerFilter.java
        OpenNLPRegexChunker.java
        OpenNLPRegexChunkerFactory.java
        OpenNLPTokenizer.java
        OpenNLPTokenizerFactory.java
        Paragraph.java
        ParagraphChunker.java
        ParagraphChunkerRawText.java
        PunctuationRemover.java
        PunctuationRemoverFactory.java
        SentenceContext.java
        SentenceContextAware.java
        WordShapeTagger.java
        uk
        ac
        shef
        dcs
        jate
        JATEException.java
        JATEProperties.java
        JATERecursiveTaskWorker.java
        algorithm
        ATTF.java
        Algorithm.java
        CValue.java
        CValueWorker.java
        ChiSquare.java
        ChiSquareWorker.java
        GlossEx.java
        RAKE.java
        RAKEWorker.java
        RIDF.java
        ReferenceBased.java
        TFIDF.java
        TTF.java
        TermEx.java
        TermInfoCollector.java
        Weirdness.java
        app
        App.java
        AppATTF.java
        AppCValue.java
        AppChiSquare.java
        AppGlossEx.java
        AppParams.java
        AppRAKE.java
        AppRIDF.java
        AppTFIDF.java
        AppTTF.java
        AppTermEx.java
        AppWeirdness.java
        Indexing.java
        Voting.java
        eval
        ATEResultLoader.java
        GSLoader.java
        Scorer.java
        feature
        AbstractFeature.java
        AbstractFeatureBuilder.java
        ChiSquareFrequentTerms.java
        ChiSquareFrequentTermsFBMaster.java
        ChiSquareFrequentTermsFBWorker.java
        Containment.java
        ContainmentFBMaster.java
        ContainmentFBWorker.java
        ContextOverlap.java
        ContextWindow.java
        Cooccurrence.java
        CooccurrenceFBMaster.java
        CooccurrenceFBWorker.java
        FrequencyCtxBased.java
        FrequencyCtxBasedCopier.java
        FrequencyCtxDocBasedFBMaster.java
        FrequencyCtxDocBasedFBWorker.java
        FrequencyCtxSentenceBasedFBMaster.java
        FrequencyCtxSentenceBasedFBWorker.java
        FrequencyCtxWindowBasedFBMaster.java
        FrequencyCtxWindowBasedFBWorker.java
        FrequencyTermBased.java
        FrequencyTermBasedFBMaster.java
        FrequencyTermBasedFBWorker.java
        PositionFeature.java
        PositionFeatureMaster.java
        PositionFeatureWorker.java
        TTFReferenceFeatureFileBuilder.java
        TermComponentIndex.java
        TermComponentIndexFBMaster.java
        TermComponentIndexFBWorker.java
        WordShapeFBMaster.java
        WordShapeFBWorker.java
        WordShapeFeature.java
        indexing
        IndexingHandler.java
        io
        CSVFileOutputReader.java
        ContentExtractor.java
        DocumentCreator.java
        FileBasedOutputWriter.java
        FileOutputReader.java
        JSONFileOutputReader.java
        TikaMultiFieldDocumentCreator.java
        TikaSimpleDocumentCreator.java
        model
        JATEDocument.java
        JATETerm.java
        TermInfo.java
        nlp
        Chunker.java
        InstanceCreator.java
        Lemmatiser.java
        POSTagger.java
        SentenceSplitter.java
        opennlp
        ChunkerOpenNLP.java
        POSTaggerOpenNLP.java
        SentenceSplitterOpenNLP.java
        solr
        ATTFProcessor.java
        CValueProcessor.java
        ChiSquareProcessor.java
        CompositeTermRecognitionProcessor.java
        GlossExProcessor.java
        RAKEProcessor.java
        RIDFProcessor.java
        TFIDFProcessor.java
        TTFProcessor.java
        TermExProcessor.java
        TermRecognitionProcessor.java
        TermRecognitionProcessorFactory.java
        TermRecognitionRequestHandler.java
        WeirdnessProcessor.java
        util
        ACLRDCorpusParser.java
        GENIACorpusParser.java
        IOUtil.java
        JATEUtil.java
        RegressionFeatureGenerator.java
        ScienceIECorpusParser.java
        SolrUtil.java
    - test
      - java
        uk
        ac
        shef
        dcs
        jate
        app
        ACLRDTECTest.java
        AppATEACLRDTECTest.java
        AppATEGENIATest.java
        BaseEmbeddedSolrTest.java
        TestRAKE.java
        nlp
        opennlp
        SentenceSplitterOpenNLPTest.java
        util
        JATEUtilTest.java

package uk.ac.shef.dcs.jate.app;

import dragon.nlp.tool.lemmatiser.EngLemmatiser;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.core.CoreContainer;
import org.junit.Assert;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.eval.ATEResultLoader;
import uk.ac.shef.dcs.jate.eval.GSLoader;
import uk.ac.shef.dcs.jate.eval.Scorer;
import uk.ac.shef.dcs.jate.model.JATEDocument;
import uk.ac.shef.dcs.jate.model.JATETerm;
import uk.ac.shef.dcs.jate.nlp.Lemmatiser;
import uk.ac.shef.dcs.jate.util.JATEUtil;

import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;

/**
 * Tests for App* of a set of ATE algorithms & The ACL RD-TEC based benchmarking test based on Embedded Solr.
 * <p>
 * ACL RD-TEC contains almost 11,000 scientific publications. Due to the performance and efficiency reasons,
 * the test and benchmarking tests can only be run manually.
 *
 * @see AppATEACLRDTECTest
 * <p>
 * ACL RD-TEC stands for the ACL Reference Dataset for Terminology Extraction and Classification.
 * @see <a href="http://catalog.elra.info/product_info.php?products_id=1236">European Language Resources Association</a>
 * @see <a href="http://acl-arc.comp.nus.edu.sg/">ACL Anthology Reference Corpus (ACL ARC)</a>
 * @see <a href="http://atmykitchen.info/datasets/acl_rd_tec/">The dataset</a>
 * <p>
 * Raw text files can be downloaded via
 * <a href="http://atmykitchen.info/datasets/acl_rd_tec/external_resource/index_external_resource.htm">
 * Resources from ACL ARC</a>
 * <p>
 * For more details about ACL RD-TEC corpus, please refer to the paper:
 * <p>
 * Zadeh, B. Q., & Handschuh, S. (2014).
 * The ACL RD-TEC: a dataset for benchmarking terminology extraction and classification in computational linguistics.
 * In COLING (pp. 52-63).
 */
public abstract class ACLRDTECTest {

    // Class-wide log4j logger.
    private static Logger LOG = Logger.getLogger(ACLRDTECTest.class.getName());

    // Directory the JVM was started from ("user.dir"); every path below is resolved against it.
    static String workingDir = System.getProperty("user.dir");

    // Default name of the Solr core used for the ACL RD-TEC benchmark.
    static String solrCoreName = "ACLRDTEC";

    // Location of the unzipped ACL RD-TEC XML corpus.
    static Path corpusDir = Paths.get(workingDir, "src", "test", "resource", "eval", "ACL_RD-TEC", "corpus", "full","xml");

    // Default Solr home directory of the test bed.
    static Path solrHome = Paths.get(workingDir, "testdata", "solr-testbed");

    // Path to "bnc_unifrqs.normal" inside the core's conf directory.
    // NOTE(review): presumably a BNC reference-corpus frequency file; not used in this class — confirm against subclasses.
    static Path FREQ_GENIC_FILE = Paths.get(workingDir, "testdata","solr-testbed", "ACLRDTEC",
            "conf","bnc_unifrqs.normal");

    // Term list that initialise() passes to GSLoader.loadACLRD to build the gold standard.
    static Path allAnnCandidTerms = Paths.get(workingDir, "src", "test", "resource", "eval",
            "ACL_RD-TEC", "terms.txt");

    // Zipped corpus read by loadCorpus(). The corpus can be downloaded from
    // <a href="http://atmykitchen.info/datasets/acl_rd_tec/cleansed_text/index_cleansed_text.htm">
    // Cleansed Text Files in XML Format</a>
    public static final Path ACL_RD_TEC_CORPUS_ZIPPED_FILE =
            Paths.get(workingDir, "src", "test", "resource", "eval", "ACL_RD-TEC", "corpus", "full","xml.zip");

    // Shared embedded Solr server; created by initialise() only while it is still null.
    static EmbeddedSolrServer server = null;
    // Gold-standard terms; (re)loaded on every initialise() call.
    static List<String> gsTerms = null;
    // Per-instance JATE configuration; created by initialise().
    JATEProperties jateProp = null;

    // Lemmatiser handed to Scorer in evaluate(); backed by the resources under src/test/resource/lemmatiser.
    static Lemmatiser lemmatiser = new Lemmatiser(new EngLemmatiser(
            Paths.get(workingDir, "src", "test", "resource", "lemmatiser").toString(), false, false
    ));

    /**
     * Prepares the shared test fixtures: the embedded Solr server (created only once),
     * the gold-standard term list and a fresh {@link JATEProperties} instance.
     * <p>
     * Before the server is created, a leftover {@code write.lock} of the core's index is removed so that
     * a previous unclean shutdown does not block the new server.
     *
     * @param solrHomeDir  Solr home directory used both to locate the stale lock and to create the
     *                     {@link CoreContainer}; when {@code null} the lock is looked up under the
     *                     default {@code solrHome}
     * @param solrCoreName name of the Solr core to open
     * @throws JATEException if the gold-standard term list cannot be loaded
     * @throws IOException   declared for callers; propagated from the underlying loaders
     */
    public void initialise(String solrHomeDir, String solrCoreName) throws JATEException, IOException {
        if (server == null) {
            // Look for the lock under the same home the container is created from. The original
            // always used the static default, which is wrong when a different home is passed.
            String lockHome = (solrHomeDir != null) ? solrHomeDir : solrHome.toString();
            File lock = Paths.get(lockHome, solrCoreName, "data", "index", "write.lock").toFile();
            if (lock.exists()) {
                LOG.warn("Previous solr did not shut down cleanly. Unlock it ...");
                Assert.assertTrue("failed to delete stale Solr lock file " + lock, lock.delete());
            }

            CoreContainer solrContainer = new CoreContainer(solrHomeDir);
            solrContainer.load();

            server = new EmbeddedSolrServer(solrContainer, solrCoreName);
        }

        gsTerms = GSLoader.loadACLRD(allAnnCandidTerms.toString());

        if (gsTerms == null) {
            throw new JATEException("ACLRDTEC CORPUS_DIR CONCEPT FILE CANNOT BE LOADED SUCCESSFULLY!");
        }

        jateProp = new JATEProperties();
    }

    /**
     * Indexes every file returned by {@code JATEUtil.loadFiles(corpusDir)} into the embedded Solr
     * server (candidate terms are extracted at index time) and commits once at the end.
     * <p>
     * A file that triggers a {@link NullPointerException} is logged and skipped so one broken
     * document does not abort the whole run. A failing commit is logged, not rethrown.
     *
     * @param corpusDir directory holding the ACL RD-TEC cleansed-text XML files
     * @throws JATEException if the file list cannot be loaded or a document cannot be indexed
     */
    public void indexAndExtract(Path corpusDir) throws JATEException {
        List<Path> files = JATEUtil.loadFiles(corpusDir);

        LOG.info("indexing and extracting candidates from " + files.size() + " files...");
        int count = 0;
        for (Path file : files) {
            try {
                // Commit is deferred to the end of the loop, hence commit=false per document.
                indexJATEDocuments(file, jateProp, false);
                count++;
                if (count % 100 == 0) {
                    LOG.info("indexing done: " + count + "/" + files.size());
                }
            } catch (NullPointerException e) {
                // Best effort: report the offending file through the logger and carry on.
                LOG.error("failed to index [" + file + "], skipping", e);
            }
        }

        try {
            server.commit();
            LOG.info("complete indexing and candidate extraction.");
        } catch (SolrServerException | IOException e) {
            LOG.error("failed to commit the index", e);
        }
    }

    /**
     * Parses a single ACL RD-TEC file and adds it to the shared embedded Solr server.
     * <p>
     * Nothing is indexed when {@code file} is {@code null}, when its path contains
     * {@code .DS_Store}, or when the parsed document is missing or has only blank content.
     *
     * @param file     corpus file to index
     * @param jateProp JATE configuration forwarded to {@code JATEUtil.addNewDoc}
     * @param commit   whether the document should be committed immediately
     * @throws JATEException if the file cannot be opened, read or closed, or if Solr rejects the document
     */
    public static void indexJATEDocuments(Path file, JATEProperties jateProp, boolean commit) throws JATEException {
        if (file == null || file.toString().contains(".DS_Store")) {
            return;
        }

        try (FileInputStream fileStream = new FileInputStream(file.toFile())) {
            JATEDocument jateDocument = JATEUtil.loadACLRDTECDocument(fileStream);

            // Guard explicitly; the original relied on the caller catching the NPE.
            if (jateDocument == null || jateDocument.getContent() == null) {
                LOG.warn(String.format("skipping [%s]: no document content could be loaded", file));
                return;
            }

            if (jateDocument.getContent().trim().length() != 0) {
                JATEUtil.addNewDoc(server, jateDocument.getId(), jateDocument.getId(),
                        jateDocument.getContent(), jateProp, commit);
            }
        } catch (IOException | SolrServerException e) {
            // FileNotFoundException is an IOException, so a missing file is reported here as well.
            throw new JATEException(String.format("failed to index [%s]: %s", file, e));
        }
    }

    /**
     * Runs a match-all query against the shared embedded Solr server and logs the hit count.
     *
     * @return the number of documents found, or {@code 0} when the query fails
     */
    public static long validate_indexing() {
        final ModifiableSolrParams matchAll = new ModifiableSolrParams();
        matchAll.set("q", "*:*");

        try {
            final SolrDocumentList hits = server.query(matchAll).getResults();
            final long found = hits.getNumFound();
            LOG.info(String.format("[%s] documents processed!", found));
            return found;
        } catch (SolrServerException | IOException queryFailure) {
            queryFailure.printStackTrace();
        }

        // Reached only when the query threw.
        return 0;
    }

    /**
     * Algorithm-specific step to be supplied by each concrete subclass.
     * NOTE(review): presumably runs one ATE algorithm against the given core and returns its ranked,
     * filtered terms for {@code evaluate} — confirm against the subclasses.
     *
     * @param server       embedded Solr server holding the indexed corpus
     * @param solrCoreName name of the Solr core to work on
     * @param jateProp     JATE configuration
     * @return the terms produced by the algorithm
     * @throws JATEException if the algorithm fails
     */
    abstract List<JATETerm> rankAndFilter(EmbeddedSolrServer server, String solrCoreName, JATEProperties jateProp) throws JATEException;

    /**
     * Scores the given terms against the gold standard and logs precision at fixed rank
     * cut-offs together with the overall recall.
     *
     * @param jateTerms     terms produced by the algorithm under test
     * @param algorithmName label used in the log output
     * @throws JATEException if loading or scoring the results fails
     */
    public void evaluate(List<JATETerm> jateTerms, String algorithmName) throws JATEException {
        LOG.info(String.format("evaluating %s ...", algorithmName));

        final List<String> ranked = ATEResultLoader.load(jateTerms);
        final double[] precisionAtRank = Scorer.computePrecisionAtRank(lemmatiser, gsTerms, ranked,
                true, false, true,
                1, 100, 1, 10,
                50, 100, 300, 500, 800, 1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000);

        final double overallRecall = Scorer.computeOverallRecall(gsTerms, ranked, lemmatiser, true, false, true,
                1, 1000, 1, 100);

        // NOTE(review): exact floating-point comparison against a fixed value; only checked with -ea.
        assert 0.75 == overallRecall;

        LOG.info(String.format("=============%s ACL RD-TEC Benchmarking Results==================", algorithmName));
        validate_indexing();

        // Same cut-offs, in the same order, as passed to computePrecisionAtRank above.
        final int[] cutoffs = {50, 100, 300, 500, 800, 1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000};
        for (int i = 0; i < cutoffs.length; i++) {
            LOG.info("  top " + cutoffs[i] + " Precision:" + precisionAtRank[i]);
        }
        LOG.info("  overall computeOverallRecall:" + overallRecall);
    }

    /**
     * Loads every document contained in the zipped ACL RD-TEC corpus
     * ({@link #ACL_RD_TEC_CORPUS_ZIPPED_FILE}); directory entries are skipped.
     * <p>
     * About the corpus: using a set of heuristics, sections such as 'bibliography' and 'acknowledgements'
     * are removed from the corpus and are organized in separate files. In addition, text cleaning is
     * performed, e.g. broken words and text segments are joined, footnotes and captions are removed and
     * sections are organised into paragraphs.
     *
     * @return the documents parsed from the entries of the ACL RD-TEC xml.zip
     * @throws JATEException if the archive cannot be opened or one of its entries cannot be read
     */
    protected static List<JATEDocument> loadCorpus() throws JATEException {
        List<JATEDocument> jateDocuments = new ArrayList<>();

        // Iterate the archive through ZipFile alone. The original also walked a ZipInputStream, closed it
        // inside the loop (so the second entry failed) and never advanced past a directory entry.
        try (ZipFile aclCorpus = new ZipFile(ACL_RD_TEC_CORPUS_ZIPPED_FILE.toFile())) {
            Enumeration<? extends ZipEntry> entries = aclCorpus.entries();
            while (entries.hasMoreElements()) {
                ZipEntry entry = entries.nextElement();
                if (entry.isDirectory()) {
                    continue;
                }
                try (InputStream docInputStream = aclCorpus.getInputStream(entry)) {
                    jateDocuments.add(JATEUtil.loadACLRDTECDocument(docInputStream));
                }
            }
        } catch (IOException e) {
            // Not only "file not found": any read failure ends up here, so report the real cause.
            throw new JATEException(String.format("failed to load corpus from [%s]: %s",
                    ACL_RD_TEC_CORPUS_ZIPPED_FILE, e));
        }

        LOG.info("number of jate Documents:" + jateDocuments.size());

        return jateDocuments;
    }
}