package uk.ac.shef.dcs.jate.app;
import dragon.nlp.tool.lemmatiser.EngLemmatiser;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.core.CoreContainer;
import org.junit.Assert;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.eval.ATEResultLoader;
import uk.ac.shef.dcs.jate.eval.GSLoader;
import uk.ac.shef.dcs.jate.eval.Scorer;
import uk.ac.shef.dcs.jate.model.JATEDocument;
import uk.ac.shef.dcs.jate.model.JATETerm;
import uk.ac.shef.dcs.jate.nlp.Lemmatiser;
import uk.ac.shef.dcs.jate.util.JATEUtil;
import java.io.*;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.*;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;
/**
* Tests for App* of a set of ATE algorithms and the ACL RD-TEC based benchmarking test, both running on Embedded Solr.
* <p>
* ACL RD-TEC contains almost 11,000 scientific publications. For performance and efficiency reasons,
* these tests and benchmarks can only be run manually.
*
* @see AppATEACLRDTECTest
* @see <p>
* The ACL RD-TEC stands for The ACL Reference Dataset for Terminology Extraction and Classification.
* @see <a href="http://catalog.elra.info/product_info.php?products_id=1236">European Language Resources Association</a>
* @see <a href="http://acl-arc.comp.nus.edu.sg/">ACL Anthology Reference Corpus (ACL ARC)</a>
* @see <a href="http://atmykitchen.info/datasets/acl_rd_tec/">The dataset</a>
* <p>
* Raw text files can be downloaded via
* <a href="http://atmykitchen.info/datasets/acl_rd_tec/external_resource/index_external_resource.htm">
* Resources from ACL ARC</a>
* <p>
* For more details about ACL RD-TEC corpus, please refer to the paper:
* <p>
* Zadeh, B. Q., &amp; Handschuh, S. (2014).
* The ACL RD-TEC: a dataset for benchmarking terminology extraction and classification in computational linguistics.
* In COLING (pp. 52-63).
*/
public abstract class ACLRDTECTest {
/** Class-wide logger. */
private static final Logger LOG = Logger.getLogger(ACLRDTECTest.class.getName());

/** Value of the {@code user.dir} system property; every path below is resolved against it. */
static String workingDir = System.getProperty("user.dir");

/** Solr core name used for the ACL RD-TEC benchmark. */
static String solrCoreName = "ACLRDTEC";

/** Directory of the unzipped ACL RD-TEC XML corpus. */
static Path corpusDir = Paths.get(workingDir, "src", "test", "resource", "eval", "ACL_RD-TEC", "corpus", "full","xml");

/** Solr home directory of the test bed. */
static Path solrHome = Paths.get(workingDir, "testdata", "solr-testbed");

/**
 * Reference file {@code bnc_unifrqs.normal} inside the core's {@code conf} directory.
 * NOTE(review): presumably a BNC unigram frequency list - confirm against its consumers.
 */
static Path FREQ_GENIC_FILE = Paths.get(workingDir, "testdata","solr-testbed", "ACLRDTEC",
        "conf","bnc_unifrqs.normal");

/** Term file handed to {@link GSLoader#loadACLRD} to build the gold-standard term list. */
static Path allAnnCandidTerms = Paths.get(workingDir, "src", "test", "resource", "eval",
        "ACL_RD-TEC", "terms.txt");

/**
 * Zipped corpus read by {@code loadCorpus()}.
 * <p>
 * The corpus can be downloaded via
 * <a href="http://atmykitchen.info/datasets/acl_rd_tec/cleansed_text/index_cleansed_text.htm">
 * Cleansed Text Files in XML Format</a>
 */
public static final Path ACL_RD_TEC_CORPUS_ZIPPED_FILE =
        Paths.get(workingDir, "src", "test", "resource", "eval", "ACL_RD-TEC", "corpus", "full","xml.zip");

/** Embedded Solr server shared by all instances; created on first use by {@code initialise}. */
static EmbeddedSolrServer server = null;

/** Gold-standard terms; (re)loaded by {@code initialise}. */
static List<String> gsTerms = null;

/** JATE configuration; created by {@code initialise}. */
JATEProperties jateProp = null;

/** Lemmatiser passed to the scorer in {@code evaluate}; backed by the resources under {@code src/test/resource/lemmatiser}. */
static Lemmatiser lemmatiser = new Lemmatiser(new EngLemmatiser(
        Paths.get(workingDir, "src", "test", "resource", "lemmatiser").toString(), false, false
));
/**
 * Prepares the fixture used by the benchmark: starts the shared embedded Solr server if it is not
 * running yet, (re)loads the gold-standard terms and creates a fresh {@link JATEProperties}.
 * <p>
 * Before the server is started, a leftover {@code write.lock} in the core's index directory is
 * removed. The lock is looked up under the same Solr home the container is created from, so a
 * caller-supplied home directory is honoured consistently.
 *
 * @param solrHomeDir  Solr home directory handed to the {@link CoreContainer}; when {@code null},
 *                     the class-level {@code solrHome} is used to locate the lock file
 * @param solrCoreName name of the core to open (shadows the static field of the same name)
 * @throws JATEException if the gold-standard term file yields no term list
 * @throws IOException   declared for the underlying loaders
 */
public void initialise(String solrHomeDir, String solrCoreName) throws JATEException, IOException {
    if (server == null) {
        // Resolve the lock against the home directory that is actually going to be opened.
        String effectiveSolrHome = solrHomeDir != null ? solrHomeDir : solrHome.toString();
        File lock = Paths.get(effectiveSolrHome, solrCoreName, "data", "index", "write.lock").toFile();
        if (lock.exists()) {
            System.err.println("Previous solr did not shut down cleanly. Unlock it ...");
            Assert.assertTrue("could not delete stale lock file " + lock, lock.delete());
        }

        CoreContainer solrContainer = new CoreContainer(solrHomeDir);
        solrContainer.load();
        server = new EmbeddedSolrServer(solrContainer, solrCoreName);
    }

    gsTerms = GSLoader.loadACLRD(allAnnCandidTerms.toString());
    if (gsTerms == null) {
        throw new JATEException("ACLRDTEC CORPUS_DIR CONCEPT FILE CANNOT BE LOADED SUCCESSFULLY!");
    }

    jateProp = new JATEProperties();
}
/**
 * Indexes every file found under the given corpus directory and commits once at the end.
 * Candidate terms are expected to be extracted at index time.
 * <p>
 * Indexing is best effort: a file that triggers a {@link NullPointerException} is logged and
 * skipped, and a failing commit is logged rather than rethrown.
 *
 * @param corpusDir directory holding the ACL RD-TEC cleansed-text XML files
 * @throws JATEException if a file cannot be indexed (propagated from {@code indexJATEDocuments})
 */
public void indexAndExtract(Path corpusDir) throws JATEException {
    List<Path> files = JATEUtil.loadFiles(corpusDir);
    LOG.info("indexing and extracting candidates from "+files.size()+" files...");

    int count = 0;
    for (Path file : files) {
        try {
            // Commit is deferred to the end of the run, hence commit=false per document.
            indexJATEDocuments(file, jateProp, false);
            count++;
            if (count % 100 == 0) {
                LOG.info("indexing done: " + count + "/" + files.size());
            }
        } catch (NullPointerException e) {
            // Keep going with the remaining files, but record which one was skipped and why.
            LOG.error("failed to index [" + file + "], file skipped", e);
        }
    }

    try {
        server.commit();
        LOG.info("complete indexing and candidate extraction.");
    } catch (SolrServerException | IOException e) {
        LOG.error("failed to commit the index", e);
    }
}
/**
 * Parses one ACL RD-TEC file and adds it to the shared Solr server, using the document id both
 * as id and as title argument of {@code JATEUtil.addNewDoc}.
 * <p>
 * Nothing is indexed when {@code file} is {@code null}, when its path contains
 * {@code .DS_Store}, or when the parsed document is missing or has blank content.
 *
 * @param file     file to index
 * @param jateProp JATE configuration forwarded to {@code JATEUtil.addNewDoc}
 * @param commit   forwarded to {@code JATEUtil.addNewDoc}
 * @throws JATEException if the file cannot be opened, read or closed, or if Solr rejects the document
 */
public static void indexJATEDocuments(Path file, JATEProperties jateProp, boolean commit) throws JATEException {
    if (file == null || file.toString().contains(".DS_Store")) {
        return;
    }

    // try-with-resources closes the stream on every path, including failures while parsing.
    try (FileInputStream fileStream = new FileInputStream(file.toFile())) {
        JATEDocument jateDocument = JATEUtil.loadACLRDTECDocument(fileStream);
        if (jateDocument == null || jateDocument.getContent() == null) {
            LOG.warn(String.format("no content could be loaded from [%s], file skipped", file.toString()));
            return;
        }
        if (jateDocument.getContent().trim().length() != 0) {
            JATEUtil.addNewDoc(server, jateDocument.getId(), jateDocument.getId(),
                    jateDocument.getContent(), jateProp, commit);
        }
    } catch (FileNotFoundException ffe) {
        throw new JATEException(ffe.toString());
    } catch (IOException ioe) {
        throw new JATEException(String.format("failed to index [%s] ", file.toString()) + ioe.toString());
    } catch (SolrServerException sse) {
        throw new JATEException(String.format("failed to index [%s] ", file.toString()) + sse.toString());
    }
}
/**
 * Counts the documents in the index by running a match-all ({@code *:*}) query against the
 * shared Solr server and logs the result.
 *
 * @return the number of documents found, or {@code 0} if the query failed (the failure is logged)
 */
public static long validate_indexing() {
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set("q", "*:*");

    try {
        QueryResponse qResp = server.query(params);
        SolrDocumentList docList = qResp.getResults();
        long numDocs = docList.getNumFound();
        LOG.info(String.format("[%s] documents processed!", numDocs));
        return numDocs;
    } catch (SolrServerException | IOException e) {
        LOG.error("failed to query the number of indexed documents", e);
    }
    return 0;
}
/**
 * Algorithm-specific step that every concrete subclass has to provide. The returned list has the
 * element type accepted by {@code evaluate}.
 * NOTE(review): judging by the name, implementations rank the candidate terms of the given core
 * and filter them - confirm against the subclasses.
 *
 * @param server       embedded Solr server to work on
 * @param solrCoreName name of the Solr core
 * @param jateProp     JATE configuration
 * @return the terms produced by the algorithm
 * @throws JATEException if the algorithm fails
 */
abstract List<JATETerm> rankAndFilter(EmbeddedSolrServer server, String solrCoreName, JATEProperties jateProp) throws JATEException;
/**
 * Scores the given terms against the gold standard and logs precision at a fixed set of rank
 * cut-offs together with the overall recall.
 *
 * @param jateTerms     terms produced by the algorithm under test
 * @param algorithmName label used in the log output
 * @throws JATEException propagated from the loaders/scorer
 */
public void evaluate(List<JATETerm> jateTerms, String algorithmName) throws JATEException {
    LOG.info(String.format("evaluating %s ...", algorithmName));

    final List<String> candidates = ATEResultLoader.load(jateTerms);

    final double[] precisionAtRank = Scorer.computePrecisionAtRank(lemmatiser, gsTerms, candidates,
            true, false, true,
            1, 100, 1, 10,
            50, 100, 300, 500, 800, 1000, 1500, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000);

    final double overallRecall = Scorer.computeOverallRecall(gsTerms, candidates, lemmatiser,
            true, false, true,
            1, 1000, 1, 100);

    // NOTE(review): only checked when the JVM runs with -ea, and it demands a recall of exactly
    // 0.75 for every algorithm - confirm that this is intended.
    assert 0.75 == overallRecall;

    LOG.info(String.format("=============%s ACL RD-TEC Benchmarking Results==================", algorithmName));
    validate_indexing();

    // Same cut-offs, in the same order, as the trailing arguments of computePrecisionAtRank above.
    final int[] cutOffs = {50, 100, 300, 500, 800, 1000, 1500, 2000,
            3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000};
    for (int i = 0; i < cutOffs.length; i++) {
        LOG.info(" top " + cutOffs[i] + " Precision:" + precisionAtRank[i]);
    }

    LOG.info(" overall computeOverallRecall:" + overallRecall);
}
/**
 * Loads every document contained in the zipped ACL RD-TEC corpus
 * ({@link #ACL_RD_TEC_CORPUS_ZIPPED_FILE}); directory entries are skipped.
 * <p>
 * About the corpus: using a set of heuristics, sections such as 'bibliography' and
 * 'acknowledgements' are removed from the corpus and are organized in separate files. In
 * addition, text cleaning is performed, e.g. broken words and text segments are joined,
 * footnotes and captions are removed and sections are organised into paragraphs.
 *
 * @return the documents parsed from the archive, in archive order
 * @throws JATEException if the archive cannot be opened or one of its entries cannot be read
 */
protected static List<JATEDocument> loadCorpus() throws JATEException {
    List<JATEDocument> jateDocuments = new ArrayList<>();

    // A single ZipFile handle is enough: it enumerates the entries and opens their streams.
    try (ZipFile aclCorpus = new ZipFile(ACL_RD_TEC_CORPUS_ZIPPED_FILE.toFile())) {
        Enumeration<? extends ZipEntry> entries = aclCorpus.entries();
        while (entries.hasMoreElements()) {
            ZipEntry entry = entries.nextElement();
            if (entry.isDirectory()) {
                continue;
            }
            try (InputStream docInputStream = aclCorpus.getInputStream(entry)) {
                jateDocuments.add(JATEUtil.loadACLRDTECDocument(docInputStream));
            }
        }
    } catch (IOException e) {
        String message = String.format("failed to load corpus from [%s]: %s",
                ACL_RD_TEC_CORPUS_ZIPPED_FILE.toString(), e.toString());
        // Logged with the exception so the stack trace is not lost.
        LOG.error(message, e);
        throw new JATEException(message);
    }

    LOG.info("number of jate Documents:" + jateDocuments.size());
    return jateDocuments;
}
}