package uk.ac.shef.dcs.jate.app; import dragon.nlp.tool.lemmatiser.EngLemmatiser; import org.apache.log4j.Logger; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.sax.BodyContentHandler; import org.junit.*; import org.xml.sax.SAXException; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.JATEProperties; import uk.ac.shef.dcs.jate.eval.ATEResultLoader; import uk.ac.shef.dcs.jate.eval.GSLoader; import uk.ac.shef.dcs.jate.eval.Scorer; import uk.ac.shef.dcs.jate.model.JATEDocument; import uk.ac.shef.dcs.jate.model.JATETerm; import uk.ac.shef.dcs.jate.nlp.Lemmatiser; import java.io.*; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; /** * Unit tests for App* of a set of ATE algorithms & GENIA based benchmarking test based on Embedded Solr * <p> * Default Solr config is retrieved through testdata/solr-testbed * Default jate.properties is loaded from classpath * Enable PoS pattern based candidate extraction in Solr (see field type "jate_text_2_terms" for reference) */ public class AppATEGENIATest extends BaseEmbeddedSolrTest { private static Logger LOG = Logger.getLogger(AppATEGENIATest.class.getName()); public static final Path GENIA_CORPUS_ZIPPED_FILE = Paths.get(workingDir, "src", "test", "resource", "eval", "GENIA", "corpus.zip"); public static final Path GENIA_CORPUS_CONCEPT_FILE = Paths.get(workingDir, "src", "test", "resource", "eval", "GENIA", "concept.txt"); public static final int EXPECTED_CANDIDATE_SIZE=38805; static Lemmatiser lemmatiser = new Lemmatiser(new EngLemmatiser( Paths.get(workingDir, "src", "test", "resource", "lemmatiser").toString(), false, false )); JATEProperties jateProperties = null; List<String> gsTerms; Map<String, String> initParams = null; private static boolean isIndexed = false; // evaluation conditions for GENIA corpus private static int EVAL_CONDITION_MIN_TERM_CONTEXT_FREQUENCY = 1; private static int EVAL_CONDITION_MIN_TERM_TOTAL_FREQUENCY = 1; private static int EVAL_CONDITION_CUTOFF_TOP_K_PERCENT = 1; private static boolean EVAL_CONDITION_IGNORE_SYMBOL = true; private static boolean EVAL_CONDITION_IGNORE_DIGITS = false; private static boolean EVAL_CONDITION_CASE_INSENSITIVE = true; private static int EVAL_CONDITION_CHAR_RANGE_MIN = 1; private static int EVAL_CONDITION_CHAR_RANGE_MAX = -1; private static int EVAL_CONDITION_TOKEN_RANGE_MIN = 1; private static int EVAL_CONDITION_TOKEN_RANGE_MAX = -1; private static boolean exportData = true; private static int[] EVAL_CONDITION_TOP_N = {50, 100, 300, 500, 800, 1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000,9000,10000, 15000, 20000, 25000, 30000}; public static String SOLR_CORE_NAME = "GENIA"; protected void setSolrCoreName() { solrCoreName = "GENIA"; } protected void setReindex() { //change this to false if you want to use existing index //always set to true for the automatic test reindex = false; } @BeforeClass public static void cleanData() { try { cleanIndexDirectory(solrHome.toString(), SOLR_CORE_NAME); cleanIndexDirectory(solrHome.toString(), "ACLRDTEC"); cleanIndexDirectory(solrHome.toString(), "jateCore"); } catch (IOException e) { e.printStackTrace(); } } @Before public void setup() throws Exception { super.setup(); LOG.info("Initialising evaluation/test of available ATE algorithms on GENIA dataset ... "); jateProperties = new JATEProperties(); System.out.println("======================================== is Indexed ? : "+ isIndexed); if (!isIndexed || reindex) { try { LOG.info("starting to indexing genia corpus ... "); indexCorpus(loadGENIACorpus()); LOG.info("complete document and term candidates indexing."); } catch (IOException ioe) { throw new JATEException("Unable to delete index data. Please clean index directory " + "[testdata/solr-testbed/jate/data] manually!"); } } else { LOG.info(" Skip document and term candidate indexing. "); } gsTerms = GSLoader.loadGenia(GENIA_CORPUS_CONCEPT_FILE.toString()); if (gsTerms == null) { throw new JATEException("GENIA CORPUS_DIR CONCEPT FILE CANNOT BE LOADED SUCCESSFULLY!"); } initParams = new HashMap<>(); initParams.put(AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey(), String.valueOf(EVAL_CONDITION_MIN_TERM_TOTAL_FREQUENCY)); initParams.put(AppParams.CUTOFF_TOP_K_PERCENT.getParamKey(), String.valueOf(EVAL_CONDITION_CUTOFF_TOP_K_PERCENT)); LOG.info("<<TEST BEGINS WITH following conditions: >>"); LOG.info(String.format("Evaluation of topN precision and overall P/R/F based on on lemmatised terms, " + "ignore symbol? [%s], ignore digits? [%s], case-insensitive? [%s], " + "char range filtering: [%s,%s], token-range filtering: [%s,%s], " + "pre-filtering min total freq: [%s], cut-off Top K precent: [%s] " + "and min context (co-occur) frequency: [%s]", EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_MIN_TERM_TOTAL_FREQUENCY, EVAL_CONDITION_CUTOFF_TOP_K_PERCENT, EVAL_CONDITION_MIN_TERM_CONTEXT_FREQUENCY)); } @AfterClass public static void tearDownAll() { try { cleanIndexDirectory(solrHome.toString(), SOLR_CORE_NAME); } catch (IOException e) { e.printStackTrace(); } } protected List<JATEDocument> loadGENIACorpus() throws JATEException { AutoDetectParser parser = new AutoDetectParser(); BodyContentHandler handler = null; Metadata metadata = new Metadata(); List<JATEDocument> corpus = new ArrayList<>(); ZipFile geniaCorpus = null; try { geniaCorpus = new ZipFile(GENIA_CORPUS_ZIPPED_FILE.toFile()); Enumeration<? extends ZipEntry> entries = geniaCorpus.entries(); while (entries.hasMoreElements()) { ZipEntry entry = entries.nextElement(); String fileName = entry.getName(); //skip file in MAC OS if (entry.isDirectory() || fileName.startsWith("__MACOSX/") || fileName.contains(".DS_Store")) continue; InputStream stream = geniaCorpus.getInputStream(entry); handler = new BodyContentHandler(-1); try { parser.parse(stream, handler, metadata); String content = handler.toString(); JATEDocument doc = new JATEDocument(fileName); doc.setContent(content); corpus.add(doc); } catch (SAXException e) { e.printStackTrace(); } catch (TikaException e) { e.printStackTrace(); } finally { stream.close(); } } } catch (IOException e) { throw new JATEException(String.format("GENIA Corpus not found from %s", GENIA_CORPUS_ZIPPED_FILE)); } finally { if (geniaCorpus != null) { try { geniaCorpus.close(); } catch (IOException e) { LOG.error(e.toString()); } } } return corpus; } protected void indexCorpus(List<JATEDocument> corpus) throws IOException, SolrServerException { int count = 0; long startTime = System.currentTimeMillis(); for (JATEDocument doc : corpus) { try { count++; super.addNewDoc(doc.getId(), doc.getId(), doc.getContent(), jateProperties, false); if (count % 500 == 0) { LOG.info(String.format("%s documents indexed.", count)); } } catch (IOException e) { e.printStackTrace(); } catch (SolrServerException e) { e.printStackTrace(); } catch (JATEException jateEx) { jateEx.printStackTrace(); LOG.warn(String.format("failed to index document. Please check JATE properties " + "for current setting for [%s] and [%s]", JATEProperties.PROPERTY_SOLR_FIELD_CONTENT_NGRAMS, JATEProperties.PROPERTY_SOLR_FIELD_CONTENT_TERMS)); } } long endTime = System.currentTimeMillis(); LOG.info(String.format("Indexing and candidate extraction took [%s] milliseconds", (endTime - startTime))); server.commit(); isIndexed = true; } @Test public void validate_indexing() { ModifiableSolrParams params = new ModifiableSolrParams(); params.set("q", "*:*"); try { QueryResponse qResp = server.query(params); SolrDocumentList docList = qResp.getResults(); assert (docList.getNumFound() == 2000); } catch (SolrServerException e) { e.printStackTrace(); } catch (IOException ioe) { ioe.printStackTrace(); } } @Test public void benchmarking_appATTF() throws JATEException, IOException { AppATTF appATTF = new AppATTF(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appATTF.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appATTF ranking took [%s] milliseconds", (endTime - startTime))); assert termList != null; if (exportData) { appATTF.outputFile = "attf_genia.json"; appATTF.write(termList); } // the results depends on specified PoS patterns // refer to genia.patterns in solr config for the default candidate extraction patterns // candidate extraction is performed at index-time LOG.info("candidate size:" + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); LOG.info("=============ATTF GENIA Benchmarking Results=================="); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); printResults(scores, precision, recall); assert 0.94 == scores[0]; assert 0.93 == scores[1]; assert 0.94 == scores[2]; assert 0.93 == scores[3]; assert 0.9 == scores[4]; assert 0.9 == scores[5]; assert 0.87 == scores[6]; assert 0.86 == scores[7]; assert 0.84 == scores[8]; assert 0.83 == scores[9]; assert 0.79 == scores[10]; assert 0.77 == scores[11]; assert 0.76 == scores[12]; assert 0.75 == scores[13]; assert 0.74 == scores[14]; assert 0.74 == scores[15]; assert 0.71 == recall; } @Test public void benchmarking_appChiSquare() throws IOException, JATEException { initParams.put(AppParams.PREFILTER_MIN_TERM_CONTEXT_FREQUENCY.getParamKey(), String.valueOf(EVAL_CONDITION_MIN_TERM_CONTEXT_FREQUENCY)); initParams.put(AppParams.CHISQUERE_FREQ_TERM_CUTOFF_PERCENTAGE.getParamKey(), String.valueOf(EVAL_CONDITION_CUTOFF_TOP_K_PERCENT)); AppChiSquare appChiSquare = new AppChiSquare(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appChiSquare.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appChiSquare ranking took [%s] milliseconds", (endTime - startTime))); assert termList != null; if (exportData) { appChiSquare.outputFile = "chisquare_genia.json"; appChiSquare.write(termList); } // the results depends on specified PoS patterns // refer to genia.patterns in solr config for the default candidate extraction patterns // candidate extraction is performed at index-time LOG.info("candidate size:" + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); LOG.info("=============CHISQUARE GENIA Benchmarking Results=================="); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); printResults(scores,precision, recall); assert 0.6 == scores[0]; assert 0.57 == scores[1]; assert 0.57 == scores[2]; assert 0.55 == scores[3]; assert 0.57 == scores[4]; assert 0.58 == scores[5]; assert 0.6 == scores[6]; assert 0.62 == scores[7]; assert 0.64 == scores[8]; assert 0.65 == scores[9]; assert 0.66 == scores[10]; assert 0.66 == scores[11]; assert 0.66 == scores[12]; assert 0.66 == scores[13]; assert 0.66 == scores[14]; assert 0.65 == scores[15]; assert 0.71 == recall; } @Test public void benchmarking_appCValue() throws IOException, JATEException { AppCValue appCValue = new AppCValue(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appCValue.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appCValue ranking took [%s] milliseconds", (endTime - startTime))); assert termList != null; if (exportData) { appCValue.outputFile = "cvalue_genia.json"; appCValue.write(termList); } // the results depends on specified PoS patterns // refer to genia.patterns in solr config for the default candidate extraction patterns // candidate extraction is performed at index-time LOG.info("candidate size:" + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); LOG.info("=============CVALUE GENIA Benchmarking Results=================="); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); printResults(scores, precision, recall); assert 0.78 == scores[0]; assert 0.77 == scores[1]; assert 0.76 == scores[2]; assert 0.75 == scores[3]; assert 0.75 == scores[4]; assert 0.77 == scores[5]; assert 0.76 == scores[6]; assert 0.76 == scores[7]; assert 0.76 == scores[8]; assert 0.74 == scores[9]; assert 0.74 == scores[10]; assert 0.72 == scores[11]; assert 0.72 == scores[12]; assert 0.72 == scores[13]; assert 0.7 == scores[14]; assert 0.68 == scores[15]; assert 0.56 == precision; assert 0.71 == recall; assert 0.63 == Scorer.getFMeasure(precision, recall); } @Test public void benchmarking_appGlossEx() throws JATEException, IOException { initParams.put(AppParams.REFERENCE_FREQUENCY_FILE.getParamKey(), REF_FREQ_FILE.toString()); AppGlossEx appGlossEx = new AppGlossEx(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appGlossEx.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appGlossEx ranking took [%s] milliseconds", (endTime - startTime))); if (exportData) { appGlossEx.outputFile = "glossex_genia.json"; appGlossEx.write(termList); } LOG.info("candidate size:" + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); LOG.info("=============GLOSSEX GENIA Benchmarking Results=================="); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); printResults(scores, precision, recall); assert 0.74 == scores[0]; assert 0.54 == scores[1]; assert 0.62 == scores[2]; assert 0.66 == scores[3]; assert 0.64 == scores[4]; assert 0.66 == scores[5]; assert 0.65 == scores[6]; assert 0.66 == scores[7]; assert 0.66 == scores[8]; assert 0.66 == scores[9]; assert 0.67 == scores[10]; assert 0.67 == scores[11]; assert 0.67 == scores[12]; assert 0.67 == scores[13]; assert 0.67 == scores[14]; assert 0.67 == scores[15]; assert 0.71 == recall; } private void printResults(double[] scores, double precision, double recall) { int topNIndex = 0; for (int topN : EVAL_CONDITION_TOP_N) { LOG.info(String.format(" top %s Precision: %s", topN, scores[topNIndex]) ); topNIndex++; } LOG.info(" overall precision: " + precision); LOG.info(" overall computeOverallRecall: " + recall); LOG.info(" overall F-measure: " + Scorer.getFMeasure(precision, recall)); } @Test public void benchmarking_appRAKE() throws JATEException, IOException { AppRAKE appRAKE = new AppRAKE(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appRAKE.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appRAKE ranking took [%s] milliseconds", (endTime - startTime))); assert termList != null; if (exportData) { appRAKE.outputFile = "rake_genia.json"; appRAKE.write(termList); } // the results depends on specified PoS patterns // refer to genia.patterns in solr config for the default candidate extraction patterns // candidate extraction is performed at index-time LOG.info("candidate size:" + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); LOG.info("=============RAKE GENIA Benchmarking Results=================="); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); printResults(scores, precision, recall); assert 0.7 == scores[0]; assert 0.7 == scores[1]; assert 0.62 == scores[2]; assert 0.63 == scores[3]; assert 0.64 == scores[4]; assert 0.63== scores[5]; assert 0.63 == scores[6]; assert 0.62 == scores[7]; assert 0.62 == scores[8]; assert 0.61 == scores[9]; assert 0.61 == scores[10]; assert 0.61 == scores[11]; assert 0.62 == scores[12]; assert 0.62 == scores[13]; assert 0.62 == scores[14]; assert 0.61 == scores[15]; assert 0.71 == recall; } @Test public void benchmarking_appRIDF() throws JATEException, IOException { AppRIDF appRIDF = new AppRIDF(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appRIDF.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appRIDF ranking took [%s] milliseconds", (endTime - startTime))); assert termList != null; if (exportData) { appRIDF.outputFile = "ridf_genia.json"; appRIDF.write(termList); } // the results depends on specified PoS patterns // refer to genia.patterns in solr config for the default candidate extraction patterns // candidate extraction is performed at index-time LOG.info("candidate size:" + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); LOG.info("=============RIDF GENIA Benchmarking Results=================="); printResults(scores, precision, recall); assert 0.86 == scores[0]; assert 0.9 == scores[1]; assert 0.89 == scores[2]; assert 0.89 == scores[3]; assert 0.86 == scores[4]; assert 0.84 == scores[5]; assert 0.81 == scores[6]; assert 0.79 == scores[7]; assert 0.77 == scores[8]; assert 0.76 == scores[9]; assert 0.76 == scores[10]; assert 0.76 == scores[11]; assert 0.75 == scores[12]; assert 0.73 == scores[13]; assert 0.72 == scores[14]; assert 0.71 == scores[15]; assert 0.71 == recall; } @Test public void benchmarking_appTermEx() throws JATEException, IOException { initParams.put(AppParams.REFERENCE_FREQUENCY_FILE.getParamKey(), REF_FREQ_FILE.toString()); AppTermEx appTermEx = new AppTermEx(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appTermEx.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appTermEx ranking took [%s] milliseconds", (endTime - startTime))); if (exportData) { appTermEx.outputFile = "termex_genia.json"; appTermEx.write(termList); } // the results depends on specified PoS patterns // refer to genia.patterns in solr config for the default candidate extraction patterns // candidate extraction is performed at index-time LOG.info("candidate size:" + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); LOG.info("=============TERMEX GENIA Benchmarking Results=================="); printResults(scores, precision, recall); assert 0.72 == scores[0]; assert 0.66 == scores[1]; assert 0.68 == scores[2]; assert 0.69 == scores[3]; assert 0.7 == scores[4]; assert 0.7 == scores[5]; assert 0.7 == scores[6]; assert 0.7 == scores[7]; assert 0.7 == scores[8]; assert 0.7 == scores[9]; assert 0.71 == scores[10]; assert 0.7 == scores[11]; assert 0.69 == scores[12]; assert 0.68 == scores[13]; assert 0.67 == scores[14]; assert 0.67 == scores[15]; assert 0.71 == recall; } @Test public void benchmarking_appTFIDF() throws JATEException, IOException { AppTFIDF appTFIDF = new AppTFIDF(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appTFIDF.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appTFIDF ranking took [%s] milliseconds", (endTime - startTime))); LOG.info("candidate size:" + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); if (exportData) { appTFIDF.outputFile = "tfidf_genia.json"; appTFIDF.write(termList); } List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); LOG.info("=============TFIDF GENIA Benchmarking Results=================="); printResults(scores, precision, recall); assert 0.68 == scores[0]; assert 0.65 == scores[1]; assert 0.62 == scores[2]; assert 0.62 == scores[3]; assert 0.61 == scores[4]; assert 0.61 == scores[5]; assert 0.63 == scores[6]; assert 0.65 == scores[7]; assert 0.69 == scores[8]; assert 0.7 == scores[9]; assert 0.71 == scores[10]; assert 0.71 == scores[11]; assert 0.71 == scores[12]; assert 0.71 == scores[13]; assert 0.72 == scores[14]; assert 0.71 == scores[15]; assert 0.71 == recall; } @Test public void benchmarking_appTTF() throws JATEException, IOException { AppTTF appTTF = new AppTTF(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appTTF.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appTTF ranking took [%s] milliseconds", (endTime - startTime))); if (exportData) { appTTF.outputFile = "ttf_genia.json"; appTTF.write(termList); } // the results depends on specified PoS patterns // refer to genia.patterns in solr config for the default candidate extraction patterns // candidate extraction is performed at index-time LOG.info("Candidate size: " + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); LOG.info("=============TTF GENIA Benchmarking Results=================="); printResults(scores, precision, recall); assert 0.58 == scores[0]; assert 0.58 == scores[1]; assert 0.57 == scores[2]; assert 0.57 == scores[3]; assert 0.58 == scores[4]; assert 0.59 == scores[5]; assert 0.61 == scores[6]; assert 0.64 == scores[7]; assert 0.67 == scores[8]; assert 0.69 == scores[9]; assert 0.7 == scores[10]; assert 0.71 == scores[11]; assert 0.71 == scores[12]; assert 0.71 == scores[13]; assert 0.7 == scores[14]; assert 0.7 == scores[15]; assert 0.71 == recall; } @Test public void benchmarking_appWeirdness() throws JATEException, IOException { initParams.put(AppParams.REFERENCE_FREQUENCY_FILE.getParamKey(), REF_FREQ_FILE.toString()); AppWeirdness appWeirdness = new AppWeirdness(initParams); long startTime = System.currentTimeMillis(); List<JATETerm> termList = appWeirdness.extract(server.getCoreContainer().getCore(solrCoreName), jateProperties); long endTime = System.currentTimeMillis(); LOG.info(String.format("appWeirdness ranking took [%s] milliseconds", (endTime - startTime))); if (exportData) { appWeirdness.outputFile = "weirdness_genia.json"; appWeirdness.write(termList); } // the results depends on specified PoS patterns // refer to genia.patterns in solr config for the default candidate extraction patterns // candidate extraction is performed at index-time LOG.info("Candidate size: " + termList.size()); Assert.assertEquals("Candidate size should be "+EXPECTED_CANDIDATE_SIZE, EXPECTED_CANDIDATE_SIZE, termList.size()); List<String> rankedTerms = ATEResultLoader.load(termList); double[] scores = Scorer.computePrecisionAtRank(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX, EVAL_CONDITION_TOP_N); double precision = Scorer.computeOverallPrecision(lemmatiser,gsTerms, rankedTerms, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); double recall = Scorer.computeOverallRecall(gsTerms, rankedTerms, lemmatiser, EVAL_CONDITION_IGNORE_SYMBOL, EVAL_CONDITION_IGNORE_DIGITS, EVAL_CONDITION_CASE_INSENSITIVE, EVAL_CONDITION_CHAR_RANGE_MIN, EVAL_CONDITION_CHAR_RANGE_MAX, EVAL_CONDITION_TOKEN_RANGE_MIN, EVAL_CONDITION_TOKEN_RANGE_MAX); LOG.info("=============WEIRDNESS GENIA Benchmarking Results=================="); printResults(scores, precision, recall); assert 0.7 == scores[0]; assert 0.78 == scores[1]; assert 0.73 == scores[2]; assert 0.76 == scores[3]; assert 0.78 == scores[4]; assert 0.77 == scores[5]; assert 0.77 == scores[6]; assert 0.77 == scores[7]; assert 0.75 == scores[8]; assert 0.75 == scores[9]; assert 0.73 == scores[10]; assert 0.73 == scores[11]; assert 0.73 == scores[12]; assert 0.72 == scores[13]; assert 0.71 == scores[14]; assert 0.71 == scores[15]; assert 0.71 == recall; } }