package info.ephyra.nlp;

import info.ephyra.util.RegexConverter;
import info.ephyra.util.StringUtils;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import opennlp.tools.coref.LinkerMode;
import opennlp.tools.coref.mention.DefaultParse;
import opennlp.tools.coref.mention.Mention;
import opennlp.tools.lang.english.PosTagger;
import opennlp.tools.lang.english.SentenceDetector;
import opennlp.tools.lang.english.Tokenizer;
import opennlp.tools.lang.english.TreebankChunker;
import opennlp.tools.lang.english.TreebankLinker;
import opennlp.tools.lang.english.TreebankParser;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.ParserME;
import opennlp.tools.postag.POSDictionary;

/**
 * <p>This class provides a common interface to the
 * <a href="http://opennlp.sourceforge.net/">OpenNLP</a> toolkit.</p>
 * 
 * <p>It supports the following natural language processing tools:
 * <ul>
 * <li>Sentence detection</li>
 * <li>Tokenization/untokenization</li>
 * <li>Part of speech (POS) tagging</li>
 * <li>Chunking</li>
 * <li>Full parsing</li>
 * <li>Coreference resolution</li>
 * </ul>
 * </p>
 * 
 * <p>All tools are held in static fields and must be initialized with their
 * respective <code>createXxx()</code> method before use; accessors return
 * <code>null</code> (or are a no-op, for {@link #link(Parse[])}) when the
 * corresponding tool has not been initialized. Not thread-safe during
 * initialization.</p>
 * 
 * @author Nico Schlaefer
 * @version 2006-05-20
 */
public class OpenNLP {
	/**
	 * Pattern for abundant blanks introduced by tokenization. More specific
	 * rules come first. T.b.c.
	 */
	private static final Pattern ABUNDANT_BLANKS = Pattern.compile("(" +
		// FIX: added "th" so ordinals like "4 th" are rejoined as well
		"\\d (st|nd|rd|th)\\b" + "|" +        // 1 st -> 1st
		"[A-Z] \\$" + "|" +                   // US $ -> US$
		"\\d , \\d\\d\\d\\D" + "|" +          // 1 , 000 -> 1,000
		"\\d (\\.|:) \\d" + "|" +             // 1 . 99 -> 1.99
		"\\B(\\$|€|¢|£|¥|¤) \\d" + "|" +      // $ 100 -> $100
		"\\d (\\$|€|¢|£|¥|¤)" + "|" +         // 100 $ -> 100$
		" (-|/) " + "|" +                     // one - third -> one-third
		"(\\(|\\[|\\{) " + "|" +              // ( ... ) -> (... )
		" (\\.|,|:|\\)|\\]|\\})" +            // Prof . -> Prof.
		")");

	/** Sentence detector from the OpenNLP project. */
	private static SentenceDetector sentenceDetector;
	/** Tokenizer from the OpenNLP project. */
	private static Tokenizer tokenizer;
	/** Part of speech tagger from the OpenNLP project. */
	private static PosTagger tagger;
	/** Chunker from the OpenNLP project. */
	private static TreebankChunker chunker;
	/** Full parser from the OpenNLP project. */
	private static ParserME parser;
	/** Linker from the OpenNLP project. */
	private static TreebankLinker linker;

	/**
	 * Prepositions/complementizers that must not be merged into a preceding
	 * noun phrase by {@link #joinNounPhrases(String[], String[])}.
	 */
	private static final Set<String> unJoinablePrepositions =
		new HashSet<String>(Arrays.asList(
			"that", "than", "which", "whose", "if", "such", "whether",
			"when", "where", "who"));

	/**
	 * Creates the sentence detector from a model file.
	 * 
	 * @param model model file
	 * @return true, iff the sentence detector was created successfully
	 */
	public static boolean createSentenceDetector(String model) {
		try {
			sentenceDetector = new SentenceDetector(model);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the tokenizer from a model file.
	 * 
	 * @param model model file
	 * @return true, iff the tokenizer was created successfully
	 */
	public static boolean createTokenizer(String model) {
		try {
			tokenizer = new Tokenizer(model);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the part of speech tagger from a model file and a case sensitive
	 * tag dictionary.
	 * 
	 * @param model model file
	 * @param tagdict case sensitive tag dictionary
	 * @return true, iff the POS tagger was created successfully
	 */
	public static boolean createPosTagger(String model, String tagdict) {
		try {
			// create POS tagger, use case sensitive tag dictionary
			tagger = new PosTagger(model, new POSDictionary(tagdict, true));
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the chunker from a model file.
	 * 
	 * @param model model file
	 * @return true, iff the chunker was created successfully
	 */
	public static boolean createChunker(String model) {
		try {
			chunker = new TreebankChunker(model);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the parser from a directory containing models.
	 * 
	 * @param dir model directory
	 * @return true, iff the parser was created successfully
	 */
	public static boolean createParser(String dir) {
		try {
			// create parser, use default beamSize and advancePercentage
			parser = TreebankParser.getParser(dir);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Creates the linker from a directory containing models.
	 * 
	 * @param dir model directory
	 * @return true, iff the linker was created successfully
	 */
	public static boolean createLinker(String dir) {
		try {
			// create linker that works on unannotated text (TEST mode)
			linker = new TreebankLinker(dir, LinkerMode.TEST);
		} catch (IOException e) {
			return false;
		}
		return true;
	}

	/**
	 * Splits a text into sentences.
	 * 
	 * @param text sequence of sentences
	 * @return array of sentences in the text or <code>null</code>, if the
	 *         sentence detector is not initialized
	 */
	public static String[] sentDetect(String text) {
		return (sentenceDetector != null)
			? sentenceDetector.sentDetect(text)
			: null;
	}

	/**
	 * A model-based tokenizer used to prepare a sentence for POS tagging.
	 * 
	 * @param text text to tokenize
	 * @return array of tokens or <code>null</code>, if the tokenizer is not
	 *         initialized
	 */
	public static String[] tokenize(String text) {
		return (tokenizer != null) ? tokenizer.tokenize(text) : null;
	}

	/**
	 * Applies the model-based tokenizer and concatenates the tokens with
	 * spaces.
	 * 
	 * @param text text to tokenize
	 * @return string of space-delimited tokens or <code>null</code>, if the
	 *         tokenizer is not initialized
	 */
	public static String tokenizeWithSpaces(String text) {
		String[] tokens = tokenize(text);
		return (tokens != null) ? StringUtils.concatWithSpaces(tokens) : null;
	}

	/**
	 * <p>Untokenizes a text by removing abundant blanks.</p>
	 * 
	 * <p>Note that it is not guaranteed that this method exactly reverts the
	 * effect of <code>tokenize()</code>.</p>
	 * 
	 * @param text text to untokenize
	 * @return text without abundant blanks
	 */
	public static String untokenize(String text) {
		Matcher m = ABUNDANT_BLANKS.matcher(text);
		while (m.find()) {
			String match = m.group(0);
			// replace() covers ALL occurrences of the matched substring, so
			// later matches of the same substring are harmless no-ops even
			// though the matcher keeps iterating over the original text
			text = text.replace(match, match.replace(" ", ""));
		}
		return text;
	}

	/**
	 * <p>Untokenizes a text by mapping it to a string that contains the
	 * original text as a subsequence.</p>
	 * 
	 * <p>Note that it is not guaranteed that this method exactly reverts the
	 * effect of <code>tokenize()</code>.</p>
	 * 
	 * @param text text to untokenize
	 * @param original string that contains the original text as a subsequence
	 * @return subsequence of the original string or the input text, iff there
	 *         is no such subsequence
	 */
	public static String untokenize(String text, String original) {
		// try with boundary matchers
		String regex = RegexConverter.strToRegexWithBounds(text);
		regex = regex.replace(" ", "\\s*+");
		Matcher m = Pattern.compile(regex).matcher(original);
		if (m.find()) return m.group(0);

		// try without boundary matchers
		regex = RegexConverter.strToRegex(text);
		regex = regex.replace(" ", "\\s*+");
		m = Pattern.compile(regex).matcher(original);
		if (m.find()) return m.group(0);

		// untokenization failed
		return text;
	}

	/**
	 * Assigns POS tags to a sentence of space-delimited tokens.
	 * 
	 * @param sentence sentence to be annotated with POS tags
	 * @return tagged sentence or <code>null</code>, if the tagger is not
	 *         initialized
	 */
	public static String tagPos(String sentence) {
		return (tagger != null) ? tagger.tag(sentence) : null;
	}

	/**
	 * Assigns POS tags to an array of tokens that form a sentence.
	 * 
	 * @param sentence array of tokens to be annotated with POS tags
	 * @return array of POS tags or <code>null</code>, if the tagger is not
	 *         initialized
	 */
	public static String[] tagPos(String[] sentence) {
		return (tagger != null) ? tagger.tag(sentence) : null;
	}

	/**
	 * Assigns chunk tags to an array of tokens and POS tags.
	 * 
	 * @param tokens array of tokens
	 * @param pos array of corresponding POS tags
	 * @return array of chunk tags or <code>null</code>, if the chunker is not
	 *         initialized
	 */
	public static String[] tagChunks(String[] tokens, String[] pos) {
		return (chunker != null) ? chunker.chunk(tokens, pos) : null;
	}

	/**
	 * Performs a full parsing on a sentence of space-delimited tokens.
	 * 
	 * @param sentence the sentence
	 * @return parse of the sentence or <code>null</code>, if the parser is not
	 *         initialized or the sentence is empty
	 */
	public static Parse parse(String sentence) {
		return (parser != null && sentence.length() > 0)
			// only get first parse (that is most likely to be correct)
			? TreebankParser.parseLine(sentence, parser, 1)[0]
			: null;
	}

	/**
	 * Identifies coreferences in an array of full parses of sentences.
	 * Currently the mentions are collected but the discourse entities are not
	 * yet returned (see TODO below). A no-op if the linker is not initialized.
	 * 
	 * @param parses array of full parses of sentences
	 */
	public static void link(Parse[] parses) {
		// FIX: guard against uninitialized linker, consistent with the null
		// checks in the other accessors (previously threw NullPointerException)
		if (linker == null) return;

		int sentenceNumber = 0;
		List<Mention> document = new ArrayList<Mention>();

		for (Parse parse : parses) {
			DefaultParse dp = new DefaultParse(parse, sentenceNumber);
			Mention[] extents = linker.getMentionFinder().getMentions(dp);

			// construct new parses for mentions which do not have constituents
			for (int i = 0; i < extents.length; i++)
				if (extents[i].getParse() == null) {
					Parse snp = new Parse(parse.getText(),
						extents[i].getSpan(), "NML", 1.0);
					parse.insert(snp);
					extents[i].setParse(new DefaultParse(snp, sentenceNumber));
				}

			document.addAll(Arrays.asList(extents));
			sentenceNumber++;
		}

		if (document.size() > 0) {
//			Mention[] ms = document.toArray(new Mention[document.size()]);
//			DiscourseEntity[] entities = linker.getEntities(ms);
			// TODO return results in an appropriate data structure
		}
	}

	/**
	 * Joins adjacent noun phrase chunks into a single noun phrase, optionally
	 * across a joinable preposition ("of", "for", ...) that connects two noun
	 * phrases. Prepositions and complementizers listed in
	 * <code>unJoinablePrepositions</code> never trigger a join.
	 * 
	 * @param tokens array of tokens of the sentence
	 * @param chunkTags array of corresponding IOB chunk tags
	 * @return new array of chunk tags with adjacent noun phrases joined, or
	 *         the input array itself if it has fewer than two elements
	 */
	public static String[] joinNounPhrases(String[] tokens,
			String[] chunkTags) {
		if (chunkTags.length < 2) return chunkTags;

		String[] newChunkTags = new String[chunkTags.length];
		newChunkTags[0] = chunkTags[0];

		for (int t = 1; t < chunkTags.length; t++) {
			if ("B-NP".equals(chunkTags[t]) &&
					("B-NP".equals(chunkTags[t - 1]) ||
					 "I-NP".equals(chunkTags[t - 1]))) {
				// NP directly follows NP: continue the phrase
				newChunkTags[t] = "I-NP";
			} else if ((t != 1) && "B-NP".equals(chunkTags[t]) &&
					"B-PP".equals(chunkTags[t - 1]) &&
					!unJoinablePrepositions.contains(tokens[t - 1]) &&
					("B-NP".equals(chunkTags[t - 2]) ||
					 "I-NP".equals(chunkTags[t - 2]))) {
				// NP-PP-NP with a joinable preposition: merge into one NP
				newChunkTags[t - 1] = "I-NP";
				newChunkTags[t] = "I-NP";
			} else {
				newChunkTags[t] = chunkTags[t];
			}
		}
		return newChunkTags;
	}
}