//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.collectionreaders; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; import java.io.IOException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import org.apache.uima.UIMAException; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import uk.gov.dstl.baleen.collectionreaders.testing.AbstractReaderTest; import uk.gov.dstl.baleen.uima.BaleenCollectionReader; public class ReutersReaderTest extends AbstractReaderTest { public ReutersReaderTest() { super(ReutersReader.class); } private final static String SGML = "<!DOCTYPE lewis SYSTEM \"lewis.dtd\">\n" + "<REUTERS TOPICS=\"YES\" LEWISSPLIT=\"TRAIN\" CGISPLIT=\"TRAINING-SET\" OLDID=\"5544\" NEWID=\"1\">\n" + "<DATE>1-JAN-1990 10:11:12.13</DATE>\n" + "<TOPICS><D>topics</D></TOPICS>\n" + "<PLACES><D>uk</D><D>usa</D></PLACES>\n" + "<PEOPLE></PEOPLE>\n" + "<ORGS></ORGS>\n" + "<EXCHANGES></EXCHANGES>\n" + "<COMPANIES></COMPANIES>\n" + "<UNKNOWN> \n" + "C T\n" + "f0704reute\n" + "</UNKNOWN>\n" + "<TEXT>\n" + "<TITLE>TITLE</TITLE>\n" + "<DATELINE> DATELINE </DATELINE><BODY> Some example\n" + "text. \n" + "Reuter\n" + "</BODY></TEXT>\n" + "</REUTERS>\n" + "<REUTERS TOPICS=\"NO\" LEWISSPLIT=\"TRAIN\" CGISPLIT=\"TRAINING-SET\" OLDID=\"2\" NEWID=\"2\">\n" + "<DATE>2-FEB-2002 20:21:22.00</DATE>\n" + "<TOPICS></TOPICS>\n" + "<PLACES><D>usa</D></PLACES>\n" + "<PEOPLE></PEOPLE>\n" + "<ORGS></ORGS>\n" + "<EXCHANGES></EXCHANGES>\n" + "<COMPANIES></COMPANIES>\n" + "<UNKNOWN>blah</UNKNOWN>\n" + "<TEXT>\n" + "<TITLE>TITLE 2</TITLE>\n" + "<DATELINE> LOCATION, Date - </DATELINE><BODY>Another example\n" + " Reute\n" + "</BODY></TEXT>\n" + "</REUTERS>\n"; private static Path tmpDir; @BeforeClass public static void beforeClass() throws IOException { tmpDir = Files.createTempDirectory("reuterstest"); Files.write(tmpDir.resolve("file.sgm"), SGML.getBytes(StandardCharsets.UTF_8)); } @AfterClass public static void afterClass() { tmpDir.toFile().delete(); } @Test public void test() throws IOException, UIMAException { BaleenCollectionReader bcr = getCollectionReader(ReutersReader.KEY_PATH, tmpDir.toAbsolutePath().toString()); bcr.initialize(); assertTrue(bcr.doHasNext()); bcr.getNext(jCas.getCas()); //assertEquals("DEV-MUC3-0001 (NOSC)", getSource(jCas)); assertEquals("Some example\ntext.", jCas.getDocumentText()); jCas.reset(); assertTrue(bcr.doHasNext()); bcr.getNext(jCas.getCas()); //assertEquals("DEV-MUC3-0001 (NOSC)", getSource(jCas)); assertEquals("Another example", jCas.getDocumentText()); assertFalse(bcr.doHasNext()); bcr.close(); } }