/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.stanbol.enhancer.engines.poschunker; import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.PHRASE_ANNOTATION; import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.POS_ANNOTATION; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.Set; import org.apache.stanbol.enhancer.engines.poschunker.PhraseTypeDefinition.TokenTypeDefinition; import org.apache.stanbol.enhancer.nlp.NlpAnnotations; import org.apache.stanbol.enhancer.nlp.model.Chunk; import org.apache.stanbol.enhancer.nlp.model.Section; import org.apache.stanbol.enhancer.nlp.model.Token; import org.apache.stanbol.enhancer.nlp.model.annotation.Value; import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag; import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory; import org.apache.stanbol.enhancer.nlp.pos.PosTag; import org.slf4j.Logger; import org.slf4j.LoggerFactory; public class PhraseBuilder { private final Logger log = LoggerFactory.getLogger(getClass()); /** * Just a fallback in case Pos annotations do not provide probabilities. * In most cases the value of this will not have any effect as typically * Pos Taggers that do not provide probabilities only emit a * single POS tag per Token. In such cases this tag will be always accepted * regardless of the configured value. <p> * The value is only important if some Pos annotation for a Token do have * probabilities while others have not. In such cases those without are rated * against other that have by using this value. Such Situations should only * occur if a chain uses several POS taggers - a setting that should be * avoided<p> */ private static final double DEFAULT_SCORE = 0.1; private final PhraseTypeDefinition phraseType; private final ChunkFactory chunkFactory; private final double minPosSocre; /** * The {@link PhraseTag} added to all {@link Chunk}s created by this * {@link PhraseBuilder} */ private final PhraseTag phraseTag; /** * Holds Tokens of a current phrase. Empty if no phrase is building. */ private List<Token> current = new ArrayList<Token>(); /** * If {@link #current} contains a Tokens matching * {@link PhraseTypeDefinition#getRequiredType()} */ boolean valid; public PhraseBuilder(PhraseTypeDefinition phraseType, ChunkFactory chunkFactory, double minPosSocre) { if(phraseType == null){ throw new IllegalArgumentException("The parsed PhraseTypeDefinition MUST NOT be NULL!"); } this.phraseType = phraseType; log.debug("Create {} for {}",getClass().getSimpleName(),phraseType); this.phraseTag = new PhraseTag(phraseType.getPhraseType().name(), phraseType.getPhraseType()); if(chunkFactory == null){ throw new IllegalArgumentException("The parsed ChunkFactory MUST NOT be NULL"); } this.chunkFactory = chunkFactory; if(minPosSocre < 0 || minPosSocre > 1){ throw new IllegalArgumentException("The parsed minPosScore '" + minPosSocre + "' MUST BE within the ranve [0..1]!"); } this.minPosSocre = minPosSocre; } public void nextToken(Token token){ if(current.isEmpty()){ //check for start checkStart(token); } else if(!checkContinuation(token)){ //check for continuation buildPhrase(token); } } public void nextSection(Section section){ buildPhrase(null); log.debug("-- next {} --", section); } @SuppressWarnings("unchecked") //varargs with generic types private void checkStart(Token token){ boolean[] states = checkCategories(token, phraseType.getStartType(), phraseType.getRequiredType()); if(states[0]){ current.add(token); if(log.isDebugEnabled()) { log.debug("-- {} phrase start --", phraseType.getPhraseType().name()); log.debug(" {}. {} {}", new Object[]{ current.size(), token, logPosCategories(token)}); } valid = states[1]; } else if(log.isTraceEnabled()){ log.trace(" - {} {}", token, logPosCategories(token)); } } @SuppressWarnings("unchecked") //varargs with generic types private boolean checkContinuation(Token token){ final boolean[] states; if(!valid){ //check for prefix types and required types states = checkCategories(token, phraseType.getPrefixType(), phraseType.getRequiredType()); } else { //check for continuation types states = checkCategories(token, phraseType.getContinuationType()); } if(states[0]){ current.add(token); if(log.isDebugEnabled()) { log.debug(" {}. {} {}", new Object[]{ current.size(), token, logPosCategories(token)}); } } if(states.length > 1){ valid = states[1]; } return states[0]; } @SuppressWarnings("unchecked") //varargs with generic types private void buildPhrase(Token token) { Token lastConsumedToken = null; if(valid){ //search backwards for the first token matching an allowed end //category int endIndex = current.size()-1; while(endIndex > 0 && !checkCategories(current.get(endIndex), phraseType.getEndType())[0]){ endIndex--; } lastConsumedToken = current.get(endIndex); //NOTE: ignore phrases with a single token if(endIndex > 0){ Chunk chunk = chunkFactory.createChunk(current.get(0), lastConsumedToken); //TODO: add support for confidence chunk.addAnnotation(PHRASE_ANNOTATION, Value.value(phraseTag)); if(log.isDebugEnabled()){ log.debug(" << add {} phrase {} '{}'", new Object[]{ phraseType.getPhraseType().name(), chunk,chunk.getSpan()}); } } else if(log.isDebugEnabled()){ log.debug(" >> ignore {} phrase with single {} ", phraseType.getPhraseType().name() , current.get(0)); } } else if(!current.isEmpty() && log.isDebugEnabled()){ log.debug(" << ignore invalid {} phrase [{},{}]", new Object[]{ phraseType.getPhraseType().name(), current.get(0).getStart(), current.get(current.size()-1).getEnd()}); } //cleanup current.clear(); valid = false; if(token != null && !token.equals(lastConsumedToken)){ //the current token might be the start of a new phrase checkStart(token); } } /** * Checks if the a the {@link NlpAnnotations#POS_ANNOTATION POS Annotations} * of a {@link Token} matches the parsed categories. This method supports * to check against multiple sets of categories to allow checking e.g. if a token * is suitable for {@link PhraseTypeDefinition#getStartType()} and * {@link PhraseTypeDefinition#getRequiredType()}. * @param token the Token * @param ttd the list of categories to check * @return if the sum of matching annotations compared to the score of all * POS annotations is higher or equals the configured {@link #minPosSocre}. * For each parsed categories set a boolean state is returned. */ private boolean[] checkCategories(Token token, TokenTypeDefinition...ttd) { //there are different ways NLP frameworks do assign scores. For some the //sum of all categories would sum up to 1.0, but as only the top three //categories are included the sum would be < 1 //Others assign scores so that each score is < 1, but the sum of all //is higher as 1.0. //There is also the possibility that no scores are present. //Because of that this sums up all scores and normalizes with the //Match.max(1.0,sumScore). //POS tags without score are assigned a #DEFAULT_SCORE. If not a single //POS tag with a score is present the sumScore is NOT normalized to 1.0 log.trace("> check Categories for {}",token); if(log.isTraceEnabled()){ for(int i = 0; i < ttd.length; i++){ log.trace( "Cat {}: {}",i,ttd[i]); } } boolean scorePresent = false; double sumScore = 0; double[] matchScores = new double[ttd.length]; for(Value<PosTag> pos : token.getAnnotations(POS_ANNOTATION)){ log.trace(" - {}",pos); double score = pos.probability(); if(score == Value.UNKNOWN_PROBABILITY){ score = DEFAULT_SCORE; } else { scorePresent = true; } sumScore = sumScore + score; for(int i = 0; i < ttd.length; i++){ if(ttd[i].matches(pos.value())){ log.trace(" matches Category {} with score {}",i,score); matchScores[i] = matchScores[i] + score; } } } boolean[] matches = new boolean[matchScores.length]; //the score used to normalize annotations. See comments at method start double normScore = scorePresent ? Math.max(1.0,sumScore) : sumScore; for(int i = 0; i < matchScores.length ; i++){ matches[i] = matchScores[i]/normScore >= minPosSocre; } return matches; } /** * used for trace level logging of Tokens part of a chunk * @param token * @return */ private String logPosCategories(Token token){ List<Value<PosTag>> posTags = token.getAnnotations(POS_ANNOTATION); List<String> catNames = new ArrayList<String>(posTags.size()); for(Value<PosTag> tag : posTags){ Set<LexicalCategory> cats = tag.value().getCategories(); if(cats.size() > 1){ catNames.add(cats.toString()); } else if(!cats.isEmpty()){ catNames.add(cats.iterator().next().toString()); } else { catNames.add(tag.value().getTag()); } } return catNames.toString(); } public static interface ChunkFactory { Chunk createChunk(Token start, Token end); } }