/**
 * Copyright (C) 2012 cogroo <cogroo@cogroo.org>
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.cogroo.analyzer;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import opennlp.tools.postag.POSTaggerME;
import opennlp.tools.util.Span;

import org.apache.log4j.Logger;
import org.cogroo.config.Analyzers;
import org.cogroo.text.Document;
import org.cogroo.text.Sentence;
import org.cogroo.text.Token;
import org.cogroo.tools.postag.GenderUtil;
import org.cogroo.util.EntityUtils;
import org.cogroo.util.TextUtils;

/**
 * The <code>POSTagger</code> class analyzes each token of a sentence and
 * classifies it grammatically.
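 * <p>
 * A minimal usage sketch, assuming a {@link Document} that has already been
 * sentence-detected and tokenized by the preceding analyzers of the pipe
 * (the model path below is only illustrative):
 *
 * <pre>
 * POSModel model = new POSModel(new FileInputStream("pt-br-pos.bin"));
 * POSTagger posTagger = new POSTagger(new POSTaggerME(model));
 * posTagger.analyze(document); // assigns a POS tag and its probability to every token
 * </pre>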
 */
public class POSTagger implements Analyzer {

  private static final Logger LOGGER = Logger.getLogger(POSTagger.class);

  private POSTaggerME tagger;

  public POSTagger(POSTaggerME tagger) {
    this.tagger = tagger;
  }

  public void analyze(Document document) {
    List<Sentence> sentences = document.getSentences();

    for (Sentence sentence : sentences) {
      List<Token> tokens = sentence.getTokens();

      String[] tags;
      double[] probs;

      // additional context (contraction and name finder outcomes) passed to the tagger
      String[][] ac = TextUtils.additionalContext(tokens,
          Arrays.asList(Analyzers.CONTRACTION_FINDER, Analyzers.NAME_FINDER));
      String[] toks = TextUtils.tokensToString(sentence.getTokens());

      synchronized (this.tagger) {
        tags = tagger.tag(toks, ac);
        probs = tagger.probs();
      }

      double finalProb = computeFinalProb(probs);
      sentence.setTokensProb(finalProb);

      if (LOGGER.isDebugEnabled()) {
        StringBuilder sb = new StringBuilder("Tagger probabilities:\n");
        for (int i = 0; i < toks.length; i++) {
          sb.append("[").append(toks[i]).append("_").append(tags[i])
              .append(" ").append(probs[i]).append("] ");
        }
        LOGGER.debug(sb.toString());
        LOGGER.debug("Average of the tag log probabilities: " + finalProb);
      }

      tags = GenderUtil.removeGender(tags);

      for (int i = 0; i < tags.length; i++) {
        tokens.get(i).setPOSTag(tags[i]);
        tokens.get(i).setPOSTagProb(probs[i]);
      }

      EntityUtils.groupTokens(sentence.getText(), tokens,
          createSpanList(toTokensArray(tokens), toTagsArray(tokens)));

      mergeHyphenedWords(sentence);
    }
  }

  /**
   * Computes the average of the log probabilities of the assigned tags;
   * returns 0 for an empty sentence.
   */
  private double computeFinalProb(double[] probs) {
    double finalProb = 0;
    for (double prob : probs) {
      finalProb += Math.log(prob);
    }
    if (probs.length > 0) {
      finalProb = finalProb / probs.length;
    }
    return finalProb;
  }

  private String[] toTokensArray(List<Token> tokens) {
    String[] arr = new String[tokens.size()];
    for (int i = 0; i < tokens.size(); i++) {
      arr[i] = tokens.get(i).getLexeme();
    }
    return arr;
  }

  private String[] toTagsArray(List<Token> tokens) {
    String[] arr = new String[tokens.size()];
    for (int i = 0; i < tokens.size(); i++) {
      arr[i] = tokens.get(i).getPOSTag();
    }
    return arr;
  }

  // adapted from OpenNLP: builds chunk spans from B-/I-/O style tags
  public static List<Span> createSpanList(String[] toks, String[] tags) {
    // initialize with the list maximum size
    List<Span> phrases = new ArrayList<Span>(toks.length);
    String startTag = "";
    int startIndex = 0;
    boolean foundPhrase = false;

    for (int ci = 0, cn = tags.length; ci < cn; ci++) {
      String pred = tags[ci];
      if (!tags[ci].startsWith("B-") && !tags[ci].startsWith("I-")) {
        pred = "O";
      }
      if (pred.startsWith("B-")
          || (!pred.equals("I-" + startTag) && !pred.equals("O"))) { // start
        if (foundPhrase) { // close the previous span
          phrases.add(new Span(startIndex, ci, startTag));
        }
        startIndex = ci;
        startTag = pred.substring(2);
        foundPhrase = true;
      } else if (pred.equals("I-" + startTag)) { // middle
        // do nothing
      } else if (foundPhrase) { // end
        phrases.add(new Span(startIndex, ci, startTag));
        foundPhrase = false;
        startTag = "";
      }
    }
    if (foundPhrase) { // leftover
      phrases.add(new Span(startIndex, tags.length, startTag));
    }

    return phrases;
  }

  private void mergeHyphenedWords(Sentence sentence) {
    List<Token> tokens = sentence.getTokens();

    // look for a "-" token and check whether it touches both neighbors
    // (no whitespace on either side)
    boolean restart = true;
    int start = 1;
    while (restart) {
      restart = false;
      for (int i = start; i < tokens.size() - 1 && !restart; i++) {
        if ("-".equals(tokens.get(i).getLexeme())) {
          if (!hasCharacterBetween(tokens.get(i - 1), tokens.get(i))
              && !hasCharacterBetween(tokens.get(i), tokens.get(i + 1))) {
            Token a = tokens.get(i - 1);
            Token b = tokens.get(i + 1);

            if (b.getPOSTag().startsWith("pron-")) {
              // enclitic pronoun: attach the "-" to the pronoun and drop the hyphen token
              b.setBoundaries(b.getStart() - 1, b.getEnd());
              b.setLexeme("-" + b.getLexeme());
              tokens.remove(i);
              restart = true;
              start = i + 1;
            } else {
              // otherwise try to merge the two terms into a single token
              String res = merge(a.getPOSTag(), b.getPOSTag());
              if (res != null) {
                String lexeme = a.getLexeme() + "-" + b.getLexeme();
                b.setLexeme(lexeme);
                b.setPOSTag(res);
                b.setBoundaries(a.getStart(), b.getEnd());
                tokens.remove(i);
                tokens.remove(i - 1);
                start = i;
                restart = true;
              }
            }
          }
        }
      }
    }
  }

  /**
   * Chooses the tag of a hyphenated compound from the tags of its parts.
   * Based on http://www.soportugues.com.br/secoes/morf/morf28.php
   */
  private String merge(String a, String b) {
    if (isNoun(a) || isNoun(b)) {
      // any noun part makes the compound a noun (this also covers the
      // noun+adjective, verb+noun and adjective+noun combinations)
      return "n";
    } else if (isAdjective(a) && isAdjective(b)) {
      return "n";
    } else if ("prep".equals(b) || "art".equals(b)) {
      return a;
    } else if (isVerb(a) && "adv".equals(b)) {
      return "n";
    } else if (a.equals(b)) {
      return a;
    }
    return null;
  }

  private boolean isVerb(String a) {
    return a.startsWith("v-");
  }

  private boolean isNoun(String b) {
    return "n".equals(b) || "n-adj".equals(b);
  }

  private boolean isAdjective(String b) {
    return "adj".equals(b) || "n-adj".equals(b);
  }

  private boolean hasCharacterBetween(Token a, Token b) {
    // true if the two tokens are not adjacent in the original text
    return a.getEnd() != b.getStart();
  }
}