PatternExtract.java example

Explorer
baleen-master
//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.patterns.data;

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import uk.gov.dstl.baleen.types.language.WordToken;
import uk.gov.dstl.baleen.types.semantic.Entity;

/**
 * Bean describing a single extracted pattern.
 *
 * A pattern is the range of document text lying between two entities. It is identified by a
 * start/end offset pair and may additionally carry the word tokens that fall within that range.
 */
public final class PatternExtract {

	/** The entity at which the pattern starts. */
	private final Entity from;

	/** The entity at which the pattern ends. */
	private final Entity to;

	/** Offset into the document text where the pattern begins. */
	private final int start;

	/** Offset into the document text where the pattern ends. */
	private final int end;

	/** Word tokens making up the pattern; {@code null} until {@link #setWordTokens(List)} is called. */
	private List<WordToken> words;

	/**
	 * Creates a pattern extract spanning the text between two entities.
	 *
	 * @param from
	 *            the first entity (start of the pattern)
	 * @param to
	 *            the second entity (end of the pattern)
	 * @param start
	 *            the start index
	 * @param end
	 *            the end index
	 */
	public PatternExtract(final Entity from, final Entity to, final int start, final int end) {
		this.start = start;
		this.end = end;
		this.from = from;
		this.to = to;
	}

	/**
	 * Gets the first entity.
	 *
	 * @return the entity at the start of the pattern
	 */
	public Entity getFrom() {
		return this.from;
	}

	/**
	 * Gets the second entity.
	 *
	 * @return the entity at the end of the pattern
	 */
	public Entity getTo() {
		return this.to;
	}

	/**
	 * Gets the start index.
	 *
	 * @return the start index supplied at construction
	 */
	public int getStart() {
		return this.start;
	}

	/**
	 * Gets the end index.
	 *
	 * @return the end index supplied at construction
	 */
	public int getEnd() {
		return this.end;
	}

	/**
	 * Sets the word tokens which form the pattern (those beneath the start-end range).
	 *
	 * The list is stored as given; no copy is made.
	 *
	 * @param words
	 *            the new word tokens
	 */
	public void setWordTokens(final List<WordToken> words) {
		this.words = words;
	}

	/**
	 * Gets the word tokens.
	 *
	 * @return the list last passed to {@link #setWordTokens(List)}, or {@code null} if it has
	 *         never been set
	 */
	public List<WordToken> getWordTokens() {
		return this.words;
	}

	/**
	 * Tests whether the text covered by this pattern contains at least one of the needles.
	 *
	 * @param documentText
	 *            the full document text the start/end indices refer to
	 * @param needles
	 *            the strings to look for
	 * @return true if any needle occurs within the covered text, false otherwise (including when
	 *         no needles are given)
	 */
	public boolean contains(final String documentText, final String... needles) {
		final String covered = getCoveredText(documentText);
		return Arrays.stream(needles)
				.anyMatch(covered::contains);
	}

	/**
	 * Gets the portion of the document text covered by this pattern.
	 *
	 * @param documentText
	 *            the full document text the start/end indices refer to
	 * @return the substring of the document text from the start index to the end index
	 */
	public String getCoveredText(final String documentText) {
		final String covered = documentText.substring(start, end);
		return covered;
	}

	/**
	 * Gets the text formed by joining the covered text of each word token with single spaces.
	 *
	 * Hence this is a 'sanitised text' rather than the covered text.
	 *
	 * @return the joined text, or the empty string if no word tokens have been set
	 */
	public String getText() {
		return words == null
				? ""
				: words.stream()
						.map(WordToken::getCoveredText)
						.collect(Collectors.joining(" "));
	}

	/**
	 * Checks whether this pattern is empty, judged solely on the word tokens (not on the
	 * start/end range).
	 *
	 * @return true if the word tokens have not been set or the list has no elements
	 */
	public boolean isEmpty() {
		if (words == null) {
			return true;
		}
		return words.isEmpty();
	}

}