//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.collectionreaders; import java.io.ByteArrayInputStream; import java.io.File; import java.io.IOException; import java.nio.file.Files; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.stream.Stream; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import org.apache.uima.UimaContext; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.jcas.JCas; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import uk.gov.dstl.baleen.collectionreaders.helpers.AbstractStreamCollectionReader; import uk.gov.dstl.baleen.exceptions.BaleenException; /** * A collection reader which loads SGM files from the Reuters21578 archive. * Archive must be extracted prior to use. * <p> * Available for download at http://www.daviddlewis.com/resources/testcollections/reuters21578/ * * @baleen.javadoc */ public class ReutersReader extends AbstractStreamCollectionReader<String> { /** * Location of the directory containing the sgm files. * * @baleen.resource String */ public static final String KEY_PATH = "path"; @ConfigurationParameter(name = KEY_PATH, mandatory = true) private String sgmPath; public ReutersReader() { // Do nothing } @Override protected Stream<String> initializeStream(UimaContext context) throws BaleenException { final File[] files = new File(sgmPath) .listFiles(f -> f.getName().endsWith(".sgm") && f.isFile()); DocumentBuilder documentBuilder; try { final DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); documentBuilder = factory.newDocumentBuilder(); } catch (final Exception e) { throw new BaleenException(e); } return Arrays.stream(files) .flatMap(sgmlFile -> fileToStream(sgmlFile, documentBuilder)) .flatMap(e -> nodeListToText(e.getElementsByTagName("BODY"))) .filter(s -> !s.isEmpty()); } private Stream<Element> fileToStream(File sgmlFile, DocumentBuilder documentBuilder){ try { final byte[] bytes = Files.readAllBytes(sgmlFile.toPath()); final String sgml = new String(bytes, "UTF-8"); // Remove the <!DOCTYPE lewis SYSTEM "lewis.dtd"> // Then add a root element String xml = "<root>" + sgml.substring("<!DOCTYPE lewis SYSTEM \"lewis.dtd\">".length()) + "</root>"; // Remove the xml = xml.replaceAll("&#\\d+;", ""); final ByteArrayInputStream input = new ByteArrayInputStream(xml.getBytes("UTF-8")); final Document doc = documentBuilder.parse(input); final NodeList reutersDocument = doc.getElementsByTagName("REUTERS"); return nodeListToElements(reutersDocument); } catch (final Exception e) { getMonitor().warn("Unable to process SGML file {}", sgmlFile.getAbsolutePath(), e); } return Stream.<Element>empty(); } private Stream<String> nodeListToText(final NodeList list) { final List<String> elements = new ArrayList<>(list.getLength()); for (int i = 0; i < list.getLength(); i++) { final Node n = list.item(i); String text = n.getTextContent(); text = text.replaceAll("Reuter?\\s*$", ""); elements.add(text.trim()); } return elements.stream(); } private Stream<Element> nodeListToElements(final NodeList list) { final List<Element> elements = new ArrayList<>(list.getLength()); for (int i = 0; i < list.getLength(); i++) { final Node n = list.item(i); if (n.getNodeType() == Element.ELEMENT_NODE) { elements.add((Element) n); } } return elements.stream(); } @Override protected void apply(String text, JCas jCas) { jCas.setDocumentLanguage("en"); jCas.setDocumentText(text); } @Override protected void doClose() throws IOException { // Do nothing } }