package uk.ac.shef.dcs.jate.app;

import org.apache.log4j.Logger;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.embedded.EmbeddedSolrServer;
import org.apache.solr.core.CoreContainer;
import org.junit.Assert;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.JATEProperties;
import uk.ac.shef.dcs.jate.model.JATEDocument;
import uk.ac.shef.dcs.jate.model.JATETerm;
import uk.ac.shef.dcs.jate.util.JATEUtil;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Sample code to reproduce the result in the original RAKE paper.
 *
 * <p>To test RAKE, change the default configuration of "jate_text_2_terms" in the Solr schema
 * to use NP extraction.
 *
 * <p>Rose, S., Engel, D., Cramer, N., &amp; Cowley, W. (2010).
 * Automatic Keyword Extraction from Individual Documents. In M. W. Berry &amp; J. Kogan (Eds.),
 * Text Mining: Theory and Applications: John Wiley &amp; Sons.
 */
public class TestRAKE {
    private static final Logger LOG = Logger.getLogger(TestRAKE.class);

    /**
     * Indexes the sample abstract into the embedded "ACLRDTEC" core found under
     * {@code <user.dir>/testdata/solr-testbed}, runs {@link AppRAKE} over it and verifies the
     * extracted term scores. Terminates the JVM with exit code 0 when every check passes.
     *
     * @param args ignored
     * @throws JATEException       propagated from the JATE indexing / extraction calls
     * @throws IOException         propagated from the Solr / JATE calls
     * @throws SolrServerException propagated from the Solr / JATE calls
     */
    public static void main(String[] args) throws JATEException, IOException, SolrServerException {
        String workingDir = System.getProperty("user.dir");
        Path solrHome = Paths.get(workingDir, "testdata", "solr-testbed");
        String solrHomeDir = solrHome.toString();
        String solrCoreName = "ACLRDTEC";

        EmbeddedSolrServer server = null;
        // Held in a variable so that the very reference handed to AppRAKE is the one released in
        // the finally block (Solr cores are reference counted; see the CoreContainer#getCore docs).
        org.apache.solr.core.SolrCore core = null;
        try {
            CoreContainer solrContainer = new CoreContainer(solrHomeDir);
            solrContainer.load();
            server = new EmbeddedSolrServer(solrContainer, solrCoreName);

            JATEDocument jateDocument = new JATEDocument("sample");
            jateDocument.setContent("Compatibility of systems of linear constraints over the set of natural numbers \n"
                    + "Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and "
                    + "nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions "
                    + "and algorithms of construction of minimal generating sets of solutions for all types of systems "
                    + "are given. These criteria and the corresponding algorithms for constructing a minimal "
                    + "supporting set of solutions can be used in solving all the considered types of systems "
                    + "and systems of mixed types.");

            JATEProperties jateProp = new JATEProperties();
            JATEUtil.addNewDoc(server, jateDocument.getId(), jateDocument.getId(),
                    jateDocument.getContent(), jateProp, true);

            LOG.info("AppRAKE ranking and filtering ... ");
            Map<String, String> initParam = new HashMap<>();
            initParam.put(AppParams.PREFILTER_MIN_TERM_TOTAL_FREQUENCY.getParamKey(), "1");
            initParam.put(AppParams.CUTOFF_TOP_K_PERCENT.getParamKey(), "1");
            AppRAKE appRAKE = new AppRAKE(initParam);

            core = server.getCoreContainer().getCore(solrCoreName);
            // Fail with a readable message instead of an NPE somewhere inside the extractor.
            Assert.assertNotNull("Solr core '" + solrCoreName + "' is not available under " + solrHomeDir, core);
            List<JATETerm> terms = appRAKE.extract(core, jateProp);
            LOG.info("complete ranking and filtering.");

            // JUnit assertions rather than the 'assert' keyword: the latter is skipped unless the
            // JVM is started with -ea, which would silently disable these checks.
            Assert.assertNotNull("AppRAKE returned no term list", terms);
            Assert.assertEquals("Unexpected number of extracted terms", 20, terms.size());

            Map<String, Double> termScoreMap = new HashMap<>();
            for (JATETerm term : terms) {
                termScoreMap.put(term.getString(), term.getScore());
                System.out.println(term.getString() + "," + term.getScore());
            }

            /*
             * The sample results are consistent with the original paper.
             *
             * The slight difference is caused by two reasons:
             * 1) candidates are generated by NP chunker rather than stop words filtering;
             * 2) in JATE 2.0, we do lemmatisation/stemming in the candidate terms analyser chain
             *    before actual scoring. So, terms may have a higher or lower score than the
             *    corresponding one in the original paper if a plural form exists,
             *    e.g., "set", "minimal supporting set", "system", "corresponding algorithm"
             */
            assertScore(termScoreMap, "linear diophantine equation", 8.5);
            assertScore(termScoreMap, "minimal supporting set", 7.916666666666666);
            assertScore(termScoreMap, "minimal set", 4.916666666666666);
            assertScore(termScoreMap, "corresponding algorithm", 4.5);
            assertScore(termScoreMap, "linear constraint", 4.5);
            assertScore(termScoreMap, "strict inequation", 4.0);
            assertScore(termScoreMap, "nonstrict inequation", 4.0);
            assertScore(termScoreMap, "mixed type", 3.666666666666667);
            assertScore(termScoreMap, "solutions and algorithm", 2.5);
            assertScore(termScoreMap, "set", 2.25);
            assertScore(termScoreMap, "type", 1.6666666666666667);
            assertScore(termScoreMap, "considered type", 3.166666666666667);
            assertScore(termScoreMap, "system", 1.4);
            assertScore(termScoreMap, "systems and system", 1.4);
            assertScore(termScoreMap, "upper", 1.0);
            assertScore(termScoreMap, "component", 1.0);
            assertScore(termScoreMap, "solution", 1.0);
            assertScore(termScoreMap, "construction", 1.0);
            assertScore(termScoreMap, "compatibility", 1.0);
        } finally {
            if (core != null) {
                // Releases the reference acquired above.
                // NOTE(review): assumes AppRAKE.extract does not itself close the core it is given - confirm.
                core.close();
            }
            if (server != null) {
                server.getCoreContainer().shutdown();
                server.close();
            }
        }

        System.exit(0);
    }

    /**
     * Asserts that {@code term} was extracted with exactly the {@code expected} score.
     * The comparison is an exact {@link Double} equality; a term that is missing from the map
     * fails the assertion (actual value {@code null}) rather than raising an NPE.
     *
     * @param scores   extracted term strings mapped to their scores
     * @param term     the term string to look up
     * @param expected the score the term must have
     */
    private static void assertScore(Map<String, Double> scores, String term, double expected) {
        Assert.assertEquals("Unexpected RAKE score for term '" + term + "'",
                Double.valueOf(expected), scores.get(term));
    }
}