//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.collectionreaders;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.apache.uima.UimaContext;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Splitter;

import uk.gov.dstl.baleen.collectionreaders.helpers.AbstractStreamCollectionReader;
import uk.gov.dstl.baleen.collectionreaders.helpers.MucEntry;
import uk.gov.dstl.baleen.exceptions.BaleenException;

/**
 * Reads the MUC-3 and MUC-4 datasets.
 * <p>
 * The text is all upper case, which Baleen performs poorly on, and it also contains metadata (not
 * just the article text). We lower case the text and remove excess metadata to create the jCas
 * document text.
 *
 * @baleen.javadoc
 */
public class MucReader extends AbstractStreamCollectionReader<MucEntry> {

  private static final Logger LOGGER = LoggerFactory.getLogger(MucReader.class);

  // This should be \n\n\n but the TST-MUC-11 is different!
  private static final Splitter ARTICLE_SPLITTER =
      Splitter.on(Pattern.compile("\n\n\\s*\n")).trimResults().omitEmptyStrings();

  /** Matches one clarification tag ({@code [...]}) together with any whitespace following it. */
  private static final Pattern CLARIFICATION_TAG = Pattern.compile("\\[.*?\\]\\s*");

  /** Matches a run of three or more whitespace characters. */
  private static final Pattern EXCESS_WHITESPACE = Pattern.compile("\\s{3,}");

  /**
   * Location of the directory containing the muc34 files.
   *
   * Note that only files which do not begin with <em>key-</em> will be used.
   *
   * @baleen.config
   */
  public static final String KEY_PATH = "path";

  @ConfigurationParameter(name = KEY_PATH, mandatory = true)
  private String mucPath;

  /**
   * Builds the stream of entries: every usable file is read and split into articles, each article
   * is parsed into an id and a body, and the body is cleaned.
   *
   * @param context the UIMA context (not used by this implementation)
   * @return a stream of cleaned entries; articles that cannot be parsed are skipped
   * @throws BaleenException if no usable files are found under the configured path
   */
  @Override
  protected Stream<MucEntry> initializeStream(UimaContext context) throws BaleenException {
    final File[] files = checkFilesExist();

    return Arrays.stream(files)
        .flatMap(MucReader::readArticles)
        .map(MucReader::toEntry)
        .filter(Objects::nonNull)
        .map(e -> {
          e.setText(cleanText(e.getText()));
          return e;
        });
  }

  /**
   * Reads a file as UTF-8 and splits it into individual articles.
   *
   * @param file the file to read
   * @return the articles in the file, or an empty stream if the file could not be read
   */
  private static Stream<String> readArticles(File file) {
    try {
      final byte[] bytes = Files.readAllBytes(file.toPath());
      final String content = new String(bytes, StandardCharsets.UTF_8);
      return StreamSupport.stream(ARTICLE_SPLITTER.split(content).spliterator(), false);
    } catch (final IOException | RuntimeException e) {
      // Best effort: one unreadable file must not stop the remaining files being processed
      LOGGER.warn("Discarding invalid content of {}", file, e);
      return Stream.empty();
    }
  }

  /**
   * Converts the raw text of one article into an entry.
   * <p>
   * The id is the first line of the article; the content is everything after the first
   * {@code --}, trimmed.
   *
   * @param text the raw article text
   * @return the entry, or {@code null} if the text has no line break (after its first character)
   *         or no {@code --} marker
   */
  private static MucEntry toEntry(String text) {
    final int nlIndex = text.indexOf('\n', 1);
    // Strip the first lines up to the article start (signified by a --)
    final int textIndex = text.indexOf("--");

    if (nlIndex == -1 || textIndex == -1) {
      return null;
    }

    final String id = text.substring(0, nlIndex);
    final String content = text.substring(textIndex + 2).trim();
    return new MucEntry(id, content);
  }

  /**
   * Check whether there are files present (which don't start with key-)
   *
   * @return the regular files directly inside the configured path whose names do not start with
   *         {@code key-}
   * @throws BaleenException if the path cannot be listed or contains no such files
   */
  public File[] checkFilesExist() throws BaleenException {
    final File[] files =
        new File(mucPath).listFiles(f -> !f.getName().startsWith("key-") && f.isFile());

    // listFiles returns null when the path is not a directory or cannot be read
    if (files == null || files.length == 0) {
      getMonitor().info("No MUC files found is the path correct: {}", mucPath);
      throw new BaleenException("No MUC files found to process");
    }

    return files;
  }

  /**
   * Normalises article text: joins the lines, removes clarification tags, collapses long runs of
   * whitespace, lower cases the text and rewrites {@code u.s.} as {@code us}.
   *
   * @param text the article body
   * @return the cleaned text
   */
  private String cleanText(String text) {
    String clean = text.replace('\n', ' ');

    // Strip out the clarification tags []
    clean = CLARIFICATION_TAG.matcher(clean).replaceAll("");
    clean = EXCESS_WHITESPACE.matcher(clean).replaceAll(" \n\n");

    // Locale.ROOT keeps the result independent of the JVM default locale
    // (e.g. under a Turkish locale 'I' would otherwise become a dotless 'ı')
    clean = clean.toLowerCase(Locale.ROOT).trim();

    // Baleen bug? Lower case U.S. breaks the sentence splitter?
    clean = clean.replace("u.s.", "us");

    return clean;
  }

  /**
   * Populates the jCas from an entry: the language is set to {@code en}, the document text to the
   * entry text, and the source URI to the entry id.
   *
   * @param entry the entry to convert
   * @param jCas the jCas to populate
   */
  @Override
  protected void apply(MucEntry entry, JCas jCas) {
    jCas.setDocumentLanguage("en");
    jCas.setDocumentText(entry.getText());
    getSupport().getDocumentAnnotation(jCas).setSourceUri(entry.getId());
  }

  @Override
  protected void doClose() throws IOException {
    // Do nothing
  }
}