MucReader.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.collectionreaders;

import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.Arrays;
import java.util.Locale;
import java.util.Objects;
import java.util.regex.Pattern;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

import org.apache.uima.UimaContext;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.jcas.JCas;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.base.Splitter;

import uk.gov.dstl.baleen.collectionreaders.helpers.AbstractStreamCollectionReader;
import uk.gov.dstl.baleen.collectionreaders.helpers.MucEntry;
import uk.gov.dstl.baleen.exceptions.BaleenException;

/**
 * Reads the MUC-3 and MUC-4 datasets.
 * <p>
 * The text is all upper case, which Baleen performs poorly on. It also contains metadata (not just
 * the article text). We lower case the text and remove excess metadata to create the jCas document
 * text.
 *
 * @baleen.javadoc
 */
public class MucReader extends AbstractStreamCollectionReader<MucEntry> {

	private static final Logger LOGGER = LoggerFactory.getLogger(MucReader.class);

	// This should be \n\n\n but the TST-MUC-11 is different!
	private static final Splitter ARTICLE_SPLITTER = Splitter.on(Pattern.compile("\n\n\\s*\n")).trimResults()
			.omitEmptyStrings();

	/** Marker after which the article body starts. */
	private static final String ARTICLE_START = "--";

	/** One or more consecutive clarification tags ([...]) together with their trailing whitespace. */
	private static final Pattern CLARIFICATION_TAGS = Pattern.compile("(\\[.*?\\]\\s*)+");

	/** A run of three or more whitespace characters; rewritten as a paragraph break. */
	private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s{3,}");

	/**
	 * Location of the directory containing the muc34 files.
	 *
	 * Note that only files which do not begin with <em>key-</em> will be used.
	 *
	 * @baleen.config
	 */
	public static final String KEY_PATH = "path";
	@ConfigurationParameter(name = KEY_PATH, mandatory = true)
	private String mucPath;

	/**
	 * Builds the stream of articles: every accepted file is read, split into articles, and each
	 * article is converted into a cleaned {@link MucEntry}. Articles that cannot be parsed are
	 * dropped.
	 *
	 * @param context
	 *            the UIMA context (not used by this reader)
	 * @return a stream of parsed and cleaned entries
	 * @throws BaleenException
	 *             if no input files are found (see {@link #checkFilesExist()})
	 */
	@Override
	protected Stream<MucEntry> initializeStream(UimaContext context) throws BaleenException {
		final File[] files = checkFilesExist();

		return Arrays.stream(files)
				.flatMap(MucReader::readArticles)
				.map(MucReader::parseEntry)
				.filter(Objects::nonNull);
	}

	/**
	 * Reads a file as UTF-8 and splits it into its individual articles.
	 *
	 * @param file
	 *            the file to read
	 * @return the trimmed, non-empty articles of the file, or an empty stream if the file could not
	 *         be read
	 */
	private static Stream<String> readArticles(File file) {
		try {
			final String content = new String(Files.readAllBytes(file.toPath()), StandardCharsets.UTF_8);
			return StreamSupport.stream(ARTICLE_SPLITTER.split(content).spliterator(), false);
		} catch (final IOException | RuntimeException e) {
			// Best effort: one unreadable file must not stop the remaining files being processed
			LOGGER.warn("Discarding invalid content of {}", file, e);
			return Stream.empty();
		}
	}

	/**
	 * Converts the raw text of one article into an entry. The first line is used as the id and
	 * everything after the first <em>--</em> is used as the (cleaned) content.
	 *
	 * @param text
	 *            the raw article text
	 * @return the entry, or null if the text has no second line or no <em>--</em> marker
	 */
	private static MucEntry parseEntry(String text) {
		final int nlIndex = text.indexOf("\n", 1);
		// Strip the first lines up to the article start (signified by a --)
		final int textIndex = text.indexOf(ARTICLE_START);
		if (nlIndex == -1 || textIndex == -1) {
			return null;
		}

		final String id = text.substring(0, nlIndex);
		final String content = text.substring(textIndex + ARTICLE_START.length()).trim();

		final MucEntry entry = new MucEntry(id, content);
		entry.setText(cleanText(entry.getText()));
		return entry;
	}

	/**
	 * Check whether there are files present (which don't start with key-)
	 *
	 * @return the regular files in the configured directory whose names do not start with key-,
	 *         sorted by path
	 * @throws BaleenException
	 *             if the directory cannot be listed or contains no matching files
	 */
	public File[] checkFilesExist() throws BaleenException{
		final File[] files = new File(mucPath)
				.listFiles(f -> !f.getName().startsWith("key-") && f.isFile());

		if (files == null || files.length == 0) {
			getMonitor().info("No MUC files found is the path correct: {}", mucPath);
			throw new BaleenException("No MUC files found to process");
		}

		// listFiles gives no ordering guarantee; sort so articles are produced in a repeatable order
		Arrays.sort(files);

		return files;
	}

	/**
	 * Normalises article text: joins lines, removes clarification tags, turns long whitespace runs
	 * into paragraph breaks and lower cases the result.
	 *
	 * @param text
	 *            the article body
	 * @return the cleaned, lower case text
	 */
	private static String cleanText(String text){
		String clean = text.replace('\n', ' ');

		// Strip out the clarification tags []
		clean = CLARIFICATION_TAGS.matcher(clean).replaceAll("");
		clean = WHITESPACE_RUN.matcher(clean).replaceAll(" \n\n");
		// Locale.ROOT keeps the result independent of the JVM default locale (e.g. Turkish dotless i)
		clean = clean.toLowerCase(Locale.ROOT).trim();
		// Baleen bug? Lower case U.S. breaks the sentence splitter?
		clean = clean.replace("u.s.", "us");

		return clean;
	}

	/**
	 * Populates the jCas from an entry: the language is set to English, the document text to the
	 * entry text, and the source URI of the document annotation to the entry id.
	 *
	 * @param entry
	 *            the entry to convert
	 * @param jCas
	 *            the jCas to populate
	 */
	@Override
	protected void apply(MucEntry entry, JCas jCas) {
		jCas.setDocumentLanguage("en");
		jCas.setDocumentText(entry.getText());

		getSupport().getDocumentAnnotation(jCas).setSourceUri(entry.getId());
	}

	@Override
	protected void doClose() throws IOException {
		// Do nothing
	}

}