package uk.ac.shef.dcs.jate.util; import org.junit.Assert; import org.junit.Test; import uk.ac.shef.dcs.jate.JATEException; import uk.ac.shef.dcs.jate.model.JATEDocument; import java.io.FileInputStream; import java.io.IOException; import java.nio.file.Path; import java.nio.file.Paths; public class JATEUtilTest { static String workingDir = System.getProperty("user.dir"); @Test public void cleanTextTest() { String text = "P r e v i o u s Efforts, C H A T - 8 0 , P R A T - 8 9 and HSQL Trondheim " + "is a small city with a university and 140000 inhabitants."; String cleanedText = JATEUtil.cleanText(text); Assert.assertTrue(("Previous Efforts, CHAT - 8 0 , PRAT - 8 9 and HSQL Trondheim is " + "a small city with a university and 140000 inhabitants.").equals(cleanedText)); } @Test public void loadDocumentText() throws JATEException, IOException { Path cleanedXMLDoc = Paths.get(workingDir, "src", "test", "resource", "eval", "ACL_RD-TEC", "corpus","util_test", "A00-1001_cln.xml"); FileInputStream xmlFileStream = new FileInputStream(cleanedXMLDoc.toFile()); try { JATEDocument jateDocument = JATEUtil.loadACLRDTECDocument(xmlFileStream); assert jateDocument.getId().equals("A00-1001"); assert jateDocument.getContent() != null; assert jateDocument.getContent().length() > 200; } finally { xmlFileStream.close(); } } @Test public void loadDocumentText2() throws JATEException, IOException { //check parsing error Path cleanedXMLDoc = Paths.get(workingDir, "src", "test", "resource", "eval", "ACL_RD-TEC", "corpus","util_test","E06-2023_cln.xml"); FileInputStream docStream = new FileInputStream(cleanedXMLDoc.toFile()); try { JATEDocument jateDocument = JATEUtil.loadACLRDTECDocument(docStream); assert jateDocument.getId().equals("E06-2023"); assert jateDocument.getContent() != null; String paragraph0 = "According to linguistic theory, morphemes are considered to be the smallest " + "meaning-bearing elements of a language. However, no adequate language-independent definition of " + "the word as a unit has been agreed upon. If effective methods can be devised for the unsupervised " + "discovery of morphemes, they could aid the formulation of a linguistic theory of morphology for a " + "new language. The utilization of morphemes as basic representational units in a statistical " + "language model instead of words seems a promising course [Creutz, 2004]."; assert jateDocument.getContent().contains(paragraph0); String paragraph1 = "Many natural language processing tasks, including parsing, semantic modeling, " + "information retrieval, and machine translation, frequently require a morphological analysis " + "of the language at hand. The task of a morphological analyzer is to identify the lexeme, " + "citation form, or inflection class of surface word forms in a language. It seems that even " + "approximate automated morphological analysis would be beneficial for many NL applications " + "dealing with large vocabularies (e.g. text retrieval applications)."; assert jateDocument.getContent().contains(paragraph1); String paragraph2 = "[Monson 2004] presents a framework for unsupervised induction of natural language " + "morphology, wherein candidate suffixes are grouped into candidate inflection classes, which are " + "then placed in a lattice structure. With similar arranged inflection classes placed near one " + "candidate in the lattice, it proposes this structure to be an ideal search space in which to " + "isolate the true inflection classes of a language. [Schone and Jurafsky 2000] presents an " + "unsupervised model in which knowledge-free distributional cues are combined orthography-based " + "with information automatically extracted from semantic word co-occurrence patterns " + "in the input corpus."; assert jateDocument.getContent().contains(paragraph2); } finally { docStream.close(); } } @Test public void testLoadACLRDTECDocument() throws JATEException, IOException { Path cleanedXMLDoc = Paths.get(workingDir, "src", "test", "resource", "eval", "ACL_RD-TEC", "corpus","util_test","E06-2023_cln.xml"); FileInputStream xmlDocStream = new FileInputStream(cleanedXMLDoc.toFile()); try { JATEDocument jateDocument = JATEUtil.loadACLRDTECDocument(xmlDocStream); assert jateDocument != null; assert jateDocument.getId() != null; assert jateDocument.getId().equals("E06-2023"); assert jateDocument.getContent() != null; //ignore reference title Assert.assertEquals(-1,jateDocument.getContent().indexOf("generation.Introduction")); //ignore reference title Assert.assertEquals(-1, jateDocument.getContent().indexOf("5.0.Fast")); //ignore reference title Assert.assertEquals(-1, jateDocument.getContent().indexOf("Fast decoding and optimal")); Assert.assertEquals(-1, jateDocument.getContent().indexOf("generation.Statistical")); } finally { xmlDocStream.close(); } } }