package info.ephyra.nlp;

import info.ephyra.util.RegexConverter;
import info.ephyra.util.StringUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.coref.LinkerMode;
import opennlp.tools.coref.mention.DefaultParse;
import opennlp.tools.coref.mention.Mention;
import opennlp.tools.lang.english.PosTagger;
import opennlp.tools.lang.english.SentenceDetector;
import opennlp.tools.lang.english.Tokenizer;
import opennlp.tools.lang.english.TreebankChunker;
import opennlp.tools.lang.english.TreebankLinker;
import opennlp.tools.lang.english.TreebankParser;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.ParserME;
import opennlp.tools.postag.POSDictionary;

/**
 * <p>This class provides a common interface to the
 * <a href="http://opennlp.sourceforge.net/">OpenNLP</a> toolkit.</p>
 * 
 * <p>It supports the following natural language processing tools:
 * <ul>
 * <li>Sentence detection</li>
 * <li>Tokenization/untokenization</li>
 * <li>Part of speech (POS) tagging</li>
 * <li>Chunking</li>
 * <li>Full parsing</li>
 * <li>Coreference resolution</li>
 * </ul>
 * </p>
 * 
 * <p>All tools are held in static fields and must be initialized with their
 * respective <code>createXxx()</code> method before use; accessors return
 * <code>null</code> (or are a no-op, for {@link #link(Parse[])}) when the
 * corresponding tool has not been initialized. Not thread-safe during
 * initialization.</p>
 * 
 * @author Nico Schlaefer
 * @version 2006-05-20
 */
public class OpenNLP {
	/**
	 * Pattern for abundant blanks introduced by tokenization. More specific
	 * rules come first. T.b.c.
	 */
	private static final Pattern ABUNDANT_BLANKS = Pattern.compile("(" +
		// FIX: added "th" so ordinals like "4 th" are rejoined as well
		"\\d (st|nd|rd|th)\\b" + "|" +        // 1 st -> 1st
		"[A-Z] \\$" + "|" +                   // US $ -> US$
		"\\d , \\d\\d\\d\\D" + "|" +          // 1 , 000 -> 1,000
		"\\d (\\.|:) \\d" + "|" +             // 1 . 99 -> 1.99
		"\\B(\\$|€|¢|£|¥|¤) \\d" + "|" +      // $ 100 -> $100
		"\\d (\\$|€|¢|£|¥|¤)" + "|" +         // 100 $ -> 100$
		" (-|/) " + "|" +                     // one - third -> one-third
		"(\\(|\\[|\\{) " + "|" +              // ( ... ) -> (... )
		" (\\.|,|:|\\)|\\]|\\})" +            // Prof . -> Prof.
		")");

	/** Sentence detector from the OpenNLP project. */
	private static SentenceDetector sentenceDetector;
	/** Tokenizer from the OpenNLP project. */
	private static Tokenizer tokenizer;
	/** Part of speech tagger from the OpenNLP project. */
	private static PosTagger tagger;
	/** Chunker from the OpenNLP project. */
	private static TreebankChunker chunker;
	/** Full parser from the OpenNLP project. */
	private static ParserME parser;
	/** Linker from the OpenNLP project. */
	private static TreebankLinker linker;

	/**
	 * Prepositions/complementizers that must not be merged into a preceding
	 * noun phrase by {@link #joinNounPhrases(String[], String[])}.
	 */
	private static final Set<String> unJoinablePrepositions =
		new HashSet<String>(Arrays.asList(
			"that", "than", "which", "whose", "if", "such", "whether",
			"when", "where", "who"));

	/**
	 * Creates the sentence detector from a model file.
	 * 
	 * @param model model file
	 * @return true, iff the sentence detector was created successfully
	 */
	public static boolean createSentenceDetector(String model) {
		try {
			sentenceDetector = new SentenceDetector(model);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the tokenizer from a model file.
	 * 
	 * @param model model file
	 * @return true, iff the tokenizer was created successfully
	 */
	public static boolean createTokenizer(String model) {
		try {
			tokenizer = new Tokenizer(model);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the part of speech tagger from a model file and a case sensitive
	 * tag dictionary.
	 * 
	 * @param model model file
	 * @param tagdict case sensitive tag dictionary
	 * @return true, iff the POS tagger was created successfully
	 */
	public static boolean createPosTagger(String model, String tagdict) {
		try {
			// create POS tagger, use case sensitive tag dictionary
			tagger = new PosTagger(model, new POSDictionary(tagdict, true));
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the chunker from a model file.
	 * 
	 * @param model model file
	 * @return true, iff the chunker was created successfully
	 */
	public static boolean createChunker(String model) {
		try {
			chunker = new TreebankChunker(model);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the parser from a directory containing models.
	 * 
	 * @param dir model directory
	 * @return true, iff the parser was created successfully
	 */
	public static boolean createParser(String dir) {
		try {
			// create parser, use default beamSize and advancePercentage
			parser = TreebankParser.getParser(dir);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the linker from a directory containing models.
	 * 
	 * @param dir model directory
	 * @return true, iff the linker was created successfully
	 */
	public static boolean createLinker(String dir) {
		try {
			// create linker that works on unannotated text (TEST mode)
			linker = new TreebankLinker(dir, LinkerMode.TEST);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Splits a text into sentences.
	 * 
	 * @param text sequence of sentences
	 * @return array of sentences in the text or <code>null</code>, if the
	 *         sentence detector is not initialized
	 */
	public static String[] sentDetect(String text) {
		return (sentenceDetector != null)
			? sentenceDetector.sentDetect(text)
			: null;
	}

	/**
	 * A model-based tokenizer used to prepare a sentence for POS tagging.
	 * 
	 * @param text text to tokenize
	 * @return array of tokens or <code>null</code>, if the tokenizer is not
	 *         initialized
	 */
	public static String[] tokenize(String text) {
		return (tokenizer != null) ? tokenizer.tokenize(text) : null;
	}

	/**
	 * Applies the model-based tokenizer and concatenates the tokens with
	 * spaces.
	 * 
	 * @param text text to tokenize
	 * @return string of space-delimited tokens or <code>null</code>, if the
	 *         tokenizer is not initialized
	 */
	public static String tokenizeWithSpaces(String text) {
		String[] tokens = tokenize(text);
		return (tokens != null) ? StringUtils.concatWithSpaces(tokens) : null;
	}

	/**
	 * <p>Untokenizes a text by removing abundant blanks.</p>
	 * 
	 * <p>Note that it is not guaranteed that this method exactly reverts the
	 * effect of <code>tokenize()</code>.</p>
	 * 
	 * @param text text to untokenize
	 * @return text without abundant blanks
	 */
	public static String untokenize(String text) {
		Matcher m = ABUNDANT_BLANKS.matcher(text);
		while (m.find()) {
			String match = m.group(0);
			// replace() covers ALL occurrences of the matched substring, so
			// later matches of the same substring are harmless no-ops even
			// though the matcher keeps iterating over the original text
			text = text.replace(match, match.replace(" ", ""));
		}
		return text;
	}

	/**
	 * <p>Untokenizes a text by mapping it to a string that contains the
	 * original text as a subsequence.</p>
	 * 
	 * <p>Note that it is not guaranteed that this method exactly reverts the
	 * effect of <code>tokenize()</code>.</p>
	 * 
	 * @param text text to untokenize
	 * @param original string that contains the original text as a subsequence
	 * @return subsequence of the original string or the input text, iff there
	 *         is no such subsequence
	 */
	public static String untokenize(String text, String original) {
		// try with boundary matchers
		String regex = RegexConverter.strToRegexWithBounds(text);
		regex = regex.replace(" ", "\\s*+");
		Matcher m = Pattern.compile(regex).matcher(original);
		if (m.find()) return m.group(0);

		// try without boundary matchers
		regex = RegexConverter.strToRegex(text);
		regex = regex.replace(" ", "\\s*+");
		m = Pattern.compile(regex).matcher(original);
		if (m.find()) return m.group(0);

		// untokenization failed
		return text;
	}

	/**
	 * Assigns POS tags to a sentence of space-delimited tokens.
	 * 
	 * @param sentence sentence to be annotated with POS tags
	 * @return tagged sentence or <code>null</code>, if the tagger is not
	 *         initialized
	 */
	public static String tagPos(String sentence) {
		return (tagger != null) ? tagger.tag(sentence) : null;
	}

	/**
	 * Assigns POS tags to an array of tokens that form a sentence.
	 * 
	 * @param sentence array of tokens to be annotated with POS tags
	 * @return array of POS tags or <code>null</code>, if the tagger is not
	 *         initialized
	 */
	public static String[] tagPos(String[] sentence) {
		return (tagger != null) ? tagger.tag(sentence) : null;
	}

	/**
	 * Assigns chunk tags to an array of tokens and POS tags.
	 * 
	 * @param tokens array of tokens
	 * @param pos array of corresponding POS tags
	 * @return array of chunk tags or <code>null</code>, if the chunker is not
	 *         initialized
	 */
	public static String[] tagChunks(String[] tokens, String[] pos) {
		return (chunker != null) ? chunker.chunk(tokens, pos) : null;
	}

	/**
	 * Performs a full parsing on a sentence of space-delimited tokens.
	 * 
	 * @param sentence the sentence
	 * @return parse of the sentence or <code>null</code>, if the parser is not
	 *         initialized or the sentence is empty
	 */
	public static Parse parse(String sentence) {
		return (parser != null && sentence.length() > 0)
			// only get first parse (that is most likely to be correct)
			? TreebankParser.parseLine(sentence, parser, 1)[0]
			: null;
	}

	/**
	 * Identifies coreferences in an array of full parses of sentences.
	 * Currently the mentions are collected but the discourse entities are not
	 * yet returned (see TODO below). A no-op if the linker is not initialized.
	 * 
	 * @param parses array of full parses of sentences
	 */
	public static void link(Parse[] parses) {
		// FIX: guard against uninitialized linker, consistent with the null
		// checks in the other accessors (previously threw NullPointerException)
		if (linker == null) return;

		int sentenceNumber = 0;
		List<Mention> document = new ArrayList<Mention>();

		for (Parse parse : parses) {
			DefaultParse dp = new DefaultParse(parse, sentenceNumber);
			Mention[] extents = linker.getMentionFinder().getMentions(dp);

			// construct new parses for mentions which do not have constituents
			for (int i = 0; i < extents.length; i++)
				if (extents[i].getParse() == null) {
					Parse snp = new Parse(parse.getText(),
						extents[i].getSpan(), "NML", 1.0);
					parse.insert(snp);
					extents[i].setParse(new DefaultParse(snp, sentenceNumber));
				}

			document.addAll(Arrays.asList(extents));
			sentenceNumber++;
		}

		if (document.size() > 0) {
//			Mention[] ms = document.toArray(new Mention[document.size()]);
//			DiscourseEntity[] entities = linker.getEntities(ms);
			// TODO return results in an appropriate data structure
		}
	}

	/**
	 * Joins adjacent noun phrase chunks into a single noun phrase, optionally
	 * across a joinable preposition ("of", "for", ...) that connects two noun
	 * phrases. Prepositions and complementizers listed in
	 * <code>unJoinablePrepositions</code> never trigger a join.
	 * 
	 * @param tokens array of tokens of the sentence
	 * @param chunkTags array of corresponding IOB chunk tags
	 * @return new array of chunk tags with adjacent noun phrases joined, or
	 *         the input array itself if it has fewer than two elements
	 */
	public static String[] joinNounPhrases(String[] tokens,
			String[] chunkTags) {
		if (chunkTags.length < 2) return chunkTags;

		String[] newChunkTags = new String[chunkTags.length];
		newChunkTags[0] = chunkTags[0];

		for (int t = 1; t < chunkTags.length; t++) {
			if ("B-NP".equals(chunkTags[t]) &&
					("B-NP".equals(chunkTags[t - 1]) ||
					 "I-NP".equals(chunkTags[t - 1]))) {
				// NP directly follows NP: continue the phrase
				newChunkTags[t] = "I-NP";
			} else if ((t != 1) && "B-NP".equals(chunkTags[t]) &&
					"B-PP".equals(chunkTags[t - 1]) &&
					!unJoinablePrepositions.contains(tokens[t - 1]) &&
					("B-NP".equals(chunkTags[t - 2]) ||
					 "I-NP".equals(chunkTags[t - 2]))) {
				// NP-PP-NP with a joinable preposition: merge into one NP
				newChunkTags[t - 1] = "I-NP";
				newChunkTags[t] = "I-NP";
			} else {
				newChunkTags[t] = chunkTags[t];
			}
		}
		return newChunkTags;
	}
}