package uk.ac.shef.dcs.jate.app;
import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.CoreContainer;
import org.junit.Assert;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.model.JATEDocument;
import uk.ac.shef.dcs.jate.model.JATETerm;
import uk.ac.shef.dcs.jate.util.JATEUtil;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
* Sample code that reproduces the results reported in the original RAKE paper.
*
* To run this sample, change the default configuration of "jate_text_2_terms" in the Solr schema
* so that it uses NP (noun phrase) extraction.
*
*
* Rose, S., Engel, D., Cramer, N., & Cowley, W. (2010).
* Automatic Keyword Extraction from Individual Documents. In M. W. Berry & J. Kogan (Eds.),
* Text Mining: Applications and Theory: John Wiley & Sons.
*/
public class TestRAKE {
    private static final Logger LOG = Logger.getLogger(TestRAKE.class);

    /** Number of terms the sample document is expected to yield. */
    private static final int EXPECTED_TERM_COUNT = 20;

    /** Absolute tolerance used when comparing floating-point term scores. */
    private static final double SCORE_DELTA = 1e-12;

    /**
     * Indexes one sample abstract into the embedded Solr core found under
     * {@code <user.dir>/testdata/solr-testbed}, runs {@link AppRAKE} over that core, prints every
     * extracted term with its score, and verifies the scores against the expected values.
     *
     * <p>All checks use JUnit's {@code Assert}, so they are enforced regardless of whether the JVM
     * was started with {@code -ea}.
     *
     * @param args ignored
     * @throws JATEException propagated from indexing or term extraction
     * @throws IOException propagated from indexing or from closing the Solr server
     * @throws SolrServerException propagated from indexing
     */
    public static void main(String[] args) throws JATEException, IOException, SolrServerException {
        String workingDir = System.getProperty("user.dir");
        Path solrHome = Paths.get(workingDir, "testdata", "solr-testbed");
        String solrHomeDir = solrHome.toString();
        String solrCoreName = "ACLRDTEC";

        EmbeddedSolrServer server = null;
        try {
            CoreContainer solrContainer = new CoreContainer(solrHomeDir);
            solrContainer.load();
            server = new EmbeddedSolrServer(solrContainer, solrCoreName);

            JATEDocument jateDocument = new JATEDocument("sample");
            jateDocument.setContent("Compatibility of systems of linear constraints over the set of natural numbers \n" +
                    "Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and " +
                    "nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions " +
                    "and algorithms of construction of minimal generating sets of solutions for all types of systems " +
                    "are given. These criteria and the corresponding algorithms for constructing a minimal " +
                    "supporting set of solutions can be used in solving all the considered types of systems " +
                    "and systems of mixed types.");

            JATEProperties jateProp = new JATEProperties();
            // The document id is passed twice, exactly as in the original call; the trailing flag
            // presumably requests a commit -- TODO confirm against JATEUtil.addNewDoc.
            JATEUtil.addNewDoc(server, jateDocument.getId(), jateDocument.getId(), jateDocument.getContent(), jateProp, true);

            LOG.info("AppRAKE ranking and filtering ... ");
            Map<String, String> initParam = new HashMap<>();
            initParam.put(AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey(), "1");
            initParam.put(AppParams.CUTOFF_TOP_K_PERCENT.getParamKey(), "1");
            AppRAKE appRAKE = new AppRAKE(initParam);
            List<JATETerm> terms = appRAKE.extract(server.getCoreContainer().getCore(solrCoreName), jateProp);
            LOG.info("complete ranking and filtering.");

            // JUnit assertions instead of the `assert` keyword: the latter is a no-op unless the
            // JVM runs with -ea, which would silently skip these checks.
            Assert.assertNotNull("AppRAKE returned no term list", terms);
            Assert.assertEquals("unexpected number of extracted terms", EXPECTED_TERM_COUNT, terms.size());

            Map<String, Double> termScoreMap = new HashMap<>();
            for (JATETerm term : terms) {
                termScoreMap.put(term.getString(), term.getScore());
                System.out.println(term.getString() + "," + term.getScore());
            }

            /*
             * The sample results are consistent with the original paper.
             *
             * The slight differences have two causes:
             * 1) candidates are generated by an NP chunker rather than by stop-word filtering;
             * 2) in JATE 2.0, lemmatisation/stemming is applied in the candidate-term analyser chain
             *    before the actual scoring.
             * So a term may score higher or lower than its counterpart in the original paper when a
             * plural form exists, e.g. "set", "minimal supporting set", "system",
             * "corresponding algorithm".
             */
            assertScore(termScoreMap, "linear diophantine equation", 8.5);
            assertScore(termScoreMap, "minimal supporting set", 7.916666666666666);
            assertScore(termScoreMap, "minimal set", 4.916666666666666);
            assertScore(termScoreMap, "corresponding algorithm", 4.5);
            assertScore(termScoreMap, "linear constraint", 4.5);
            assertScore(termScoreMap, "strict inequation", 4.0);
            assertScore(termScoreMap, "nonstrict inequation", 4.0);
            assertScore(termScoreMap, "mixed type", 3.666666666666667);
            assertScore(termScoreMap, "solutions and algorithm", 2.5);
            assertScore(termScoreMap, "set", 2.25);
            assertScore(termScoreMap, "type", 1.6666666666666667);
            assertScore(termScoreMap, "considered type", 3.166666666666667);
            assertScore(termScoreMap, "system", 1.4);
            assertScore(termScoreMap, "systems and system", 1.4);
            assertScore(termScoreMap, "upper", 1.0);
            assertScore(termScoreMap, "component", 1.0);
            assertScore(termScoreMap, "solution", 1.0);
            assertScore(termScoreMap, "construction", 1.0);
            assertScore(termScoreMap, "compatibility", 1.0);
        } finally {
            if (server != null) {
                // NOTE(review): Solr documents CoreContainer.getCore() as incrementing the core's
                // reference count, so this getCore()/close() pair appears to release only the
                // reference it has just acquired, not the one handed to extract() above. Confirm
                // whether that earlier reference needs an explicit close().
                server.getCoreContainer().getCore(solrCoreName).close();
                server.getCoreContainer().shutdown();
                server.close();
            }
        }
        // NOTE(review): presumably forces JVM termination in case embedded Solr leaves
        // non-daemon threads running -- confirm before removing.
        System.exit(0);
    }

    /**
     * Verifies that {@code term} was extracted and that its score matches {@code expected}.
     *
     * @param scores   extracted term strings mapped to their scores
     * @param term     the term string to look up
     * @param expected the expected score for {@code term}
     */
    private static void assertScore(Map<String, Double> scores, String term, double expected) {
        Double actual = scores.get(term);
        // Separate presence check so a missing term is reported by name rather than as "<null>".
        Assert.assertNotNull("no score found for term '" + term + "'", actual);
        Assert.assertEquals("unexpected score for term '" + term + "'",
                expected, actual.doubleValue(), SCORE_DELTA);
    }
}