/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.enhancer.engines.poschunker;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Set;
import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.nlp.pos.Pos;
import org.apache.stanbol.enhancer.nlp.pos.PosTag;
/**
* Definition of a phrase type<p>
*
* Phrases are defined by sets of POS tags that play one of the following roles: <ul>
* <li> required types - types of tokens that must occur in the phrase;
* typically nouns for noun phrases, verbs for verb phrases.
* <li> start types - types that can start a new phrase
* <li> prefix types - types that can continue a phrase not yet containing a
* required token
* <li> continuation types - types that can continue a phrase already containing
* a required token
* <li> end types - types that can end a phrase. Used to remove trailing tokens
* from a phrase (typically punctuation).
* </ul>
*
* <b>TODO:</b> Add support for {@link Pos} and String tags in addition to
* {@link LexicalCategory}.
*
* @author Rupert Westenthaler
*
*/
public class PhraseTypeDefinition {

    /** The lexical category identifying the type of phrase described by this definition. */
    protected final LexicalCategory phraseType;

    private final TokenTypeDefinition startTypeDefinition;
    private final TokenTypeDefinition prefixTypeDefinition;
    private final TokenTypeDefinition continuationTypeDefinition;
    private final TokenTypeDefinition requiredTypeDefinition;
    private final TokenTypeDefinition endTypeDefinition;

    /**
     * Creates a phrase type definition for the passed phrase type. All five
     * token type definitions (start, prefix, continuation, required and end)
     * are initialised with the phrase type as their only lexical category and
     * can be adapted afterwards via the instances returned by the getters.
     * @param phraseType the type of the phrase
     * @throws IllegalArgumentException if the passed phrase type is <code>null</code>
     */
    public PhraseTypeDefinition(LexicalCategory phraseType) {
        if(phraseType == null){
            throw new IllegalArgumentException("The parsed PhraseType MUST NOT be NULL!");
        }
        this.phraseType = phraseType;
        startTypeDefinition = new TokenTypeDefinition(phraseType);
        prefixTypeDefinition = new TokenTypeDefinition(phraseType);
        continuationTypeDefinition = new TokenTypeDefinition(phraseType);
        requiredTypeDefinition = new TokenTypeDefinition(phraseType);
        endTypeDefinition = new TokenTypeDefinition(phraseType);
    }

    /**
     * Getter for the type of this phrase definition
     * @return the {@link LexicalCategory} passed to the constructor (never
     * <code>null</code>)
     */
    public LexicalCategory getPhraseType(){
        return phraseType;
    }

    /**
     * Getter for the definition of the start types.
     * @return the (modifiable) {@link TokenTypeDefinition} describing the
     * tokens that can start a phrase of that type
     */
    public TokenTypeDefinition getStartType(){
        return startTypeDefinition;
    }

    /**
     * Getter for the definition of the prefix types.
     * @return the (modifiable) {@link TokenTypeDefinition} describing the
     * tokens that can continue a phrase that does not yet include a token
     * classified with a {@link #getRequiredType() required type}. A typical
     * Example are {@link LexicalCategory#Adjective} in Noun Phrases that need
     * to be considered in prefixes (e.g. "A nice weekend") but excluded after
     * the first noun (e.g. "the trip last week").
     */
    public TokenTypeDefinition getPrefixType(){
        return prefixTypeDefinition;
    }

    /**
     * Getter for the definition of the continuation types.
     * @return the (modifiable) {@link TokenTypeDefinition} describing the
     * tokens that can continue a phrase that does already include a token
     * classified with a {@link #getRequiredType() required type}. Types that
     * should only be accepted before the first required token (see the
     * Adjective example of {@link #getPrefixType()}) are configured as
     * prefix types but not as continuation types.
     */
    public TokenTypeDefinition getContinuationType(){
        return continuationTypeDefinition;
    }

    /**
     * Getter for the definition of the required types.
     * @return the (modifiable) {@link TokenTypeDefinition} describing the
     * tokens that MUST occur within a phrase of that type
     */
    public TokenTypeDefinition getRequiredType(){
        return requiredTypeDefinition;
    }

    /**
     * Getter for the definition of the end types.
     * @return the (modifiable) {@link TokenTypeDefinition} describing the
     * tokens that can end a phrase of that type
     */
    public TokenTypeDefinition getEndType(){
        return endTypeDefinition;
    }

    /**
     * @return the {@link LexicalCategory#name() name} of the phrase type
     */
    @Override
    public String toString() {
        return phraseType.name();
    }

    /**
     * Describes a class of tokens by included {@link LexicalCategory
     * LexicalCategories}, included {@link Pos} tags, included string tags and
     * excluded {@link Pos} tags. A {@link PosTag} {@link #matches(PosTag)
     * matches} if it is selected by at least one of the included sets and by
     * none of the excluded {@link Pos} tags.
     */
    public static class TokenTypeDefinition {

        private final Set<LexicalCategory> categories = EnumSet.noneOf(LexicalCategory.class);
        private final Set<Pos> posTags = EnumSet.noneOf(Pos.class);
        private final Set<Pos> excludedPosTags = EnumSet.noneOf(Pos.class);
        private final Set<String> tags = new HashSet<String>();

        /**
         * Used by the constructor of the {@link PhraseTypeDefinition} class.
         * Creates a definition that includes only the passed category.
         * @param lc the lexical category
         */
        private TokenTypeDefinition(LexicalCategory lc){
            // no POS tags (null) and no string tags (empty varargs)
            this(Collections.singleton(lc),null);
        }

        /**
         * Creates a definition with the passed included categories, POS tags
         * and string tags. <code>null</code> collections as well as
         * <code>null</code> elements are ignored. The set of excluded POS
         * tags is initially empty.
         * @param categories the included lexical categories or <code>null</code>
         * @param posTags the included POS tags or <code>null</code>
         * @param tags the included string tags
         */
        public TokenTypeDefinition(Set<LexicalCategory> categories, Set<Pos> posTags, String...tags) {
            addAllNonNull(this.categories, categories);
            addAllNonNull(this.posTags, posTags);
            add(this.tags, tags);
        }

        /**
         * Read-/writeable set of {@link LexicalCategory LexicalCategories}
         * @return the set of lexical categories
         */
        public Set<LexicalCategory> getCategories() {
            return categories;
        }

        /**
         * Adds the passed {@link LexicalCategory LexicalCategories}
         * @param categories the LexicalCategories
         * @return if the {@link TokenTypeDefinition} was updated by this operation
         */
        public boolean addCategories(LexicalCategory...categories){
            return add(this.categories, categories);
        }

        /**
         * Removes the passed {@link LexicalCategory LexicalCategories}
         * @param categories the LexicalCategories
         * @return if the {@link TokenTypeDefinition} was updated by this operation
         */
        public boolean removeCategories(LexicalCategory...categories){
            return remove(this.categories, categories);
        }

        /**
         * Read-/writeable set of {@link Pos} tags
         * @return the set of POS tags
         */
        public Set<Pos> getPosTags() {
            return posTags;
        }

        /**
         * Adds the passed {@link Pos} tags
         * @param pos the {@link Pos} tags
         * @return if the {@link TokenTypeDefinition} was updated by this operation
         */
        public boolean addPosTags(Pos...pos){
            return add(this.posTags, pos);
        }

        /**
         * Removes the passed {@link Pos} tags
         * @param pos the {@link Pos} tags
         * @return if the {@link TokenTypeDefinition} was updated by this operation
         */
        public boolean removePosTags(Pos...pos){
            return remove(this.posTags, pos);
        }

        /**
         * Read-/writeable set of excluded {@link Pos} tags. This allows to
         * include a {@link LexicalCategory} but to exclude some specific
         * {@link Pos} member of this category.
         * @return the set of excluded POS tags
         */
        public Set<Pos> getExcludedPosTags() {
            return excludedPosTags;
        }

        /**
         * Adds the passed {@link Pos} tags to the set of excluded {@link Pos} tags
         * @param pos the {@link Pos} tags
         * @return if the {@link TokenTypeDefinition} was updated by this operation
         */
        public boolean addExcludedPosTags(Pos...pos){
            return add(this.excludedPosTags, pos);
        }

        /**
         * Removes the passed {@link Pos} tags from the set of excluded {@link Pos} tags
         * @param pos the {@link Pos} tags
         * @return if the {@link TokenTypeDefinition} was updated by this operation
         */
        public boolean removeExcludedPosTags(Pos...pos){
            return remove(this.excludedPosTags, pos);
        }

        /**
         * Read-/writeable set of string tags (as provided by the POS tagger)
         * @return the set of String tags
         */
        public Set<String> getTags() {
            return tags;
        }

        /**
         * Adds the passed tags
         * @param tag the tags
         * @return if the {@link TokenTypeDefinition} was updated by this operation
         */
        public boolean addTags(String...tag){
            return add(this.tags, tag);
        }

        /**
         * Removes the passed tags
         * @param tag the tags
         * @return if the {@link TokenTypeDefinition} was updated by this operation
         */
        public boolean removeTags(String...tag){
            return remove(this.tags, tag);
        }

        /**
         * Checks if a posTag matches against this TokenTypeDefinition. A
         * posTag matches if at least one of its categories, one of the
         * members of its POS hierarchy or its string tag is included AND none
         * of the members of its POS hierarchy is excluded.
         * @param posTag the posTag to check
         * @return <code>true</code> in case of a match. Otherwise <code>false</code>
         * @throws NullPointerException if the passed posTag is <code>null</code>
         */
        public boolean matches(PosTag posTag){
            //check against included categories, posTags and tags
            boolean included =
                    !Collections.disjoint(posTag.getCategories(), categories)
                    || !Collections.disjoint(posTag.getPosHierarchy(), posTags)
                    || tags.contains(posTag.getTag());
            //an included tag is still rejected if one of its POS is excluded
            return included
                    && Collections.disjoint(posTag.getPosHierarchy(), excludedPosTags);
        }

        /**
         * Copies all non-<code>null</code> elements of the source to the target.
         * @param target the set the elements are added to
         * @param source the elements to add or <code>null</code> for none
         */
        private static <T> void addAllNonNull(Set<T> target, Set<? extends T> source){
            if(source == null){
                return;
            }
            for(T element : source){
                if(element != null){
                    target.add(element);
                }
            }
        }

        /**
         * Adds all non-<code>null</code> types to the set.
         * @param set the set to add the types to
         * @param types the types to add (<code>null</code> is ignored)
         * @return <code>true</code> if at least one type was newly added
         */
        private static <T> boolean add(Set<T> set, T...types){
            boolean changed = false;
            if(types != null){
                for(T type : types){
                    if(type != null && set.add(type)){
                        changed = true;
                    }
                }
            }
            return changed;
        }

        /**
         * Removes all non-<code>null</code> types from the set.
         * @param set the set to remove the types from
         * @param types the types to remove (<code>null</code> is ignored)
         * @return <code>true</code> if at least one type was removed
         */
        private static <T> boolean remove(Set<T> set, T...types){
            boolean changed = false;
            if(types != null){
                for(T type : types){
                    if(type != null && set.remove(type)){
                        changed = true;
                    }
                }
            }
            return changed;
        }

        /**
         * Formats the non-empty sets of this definition as
         * <code>Cat: .. | Pos: .. | Tags: .. | Excluded: ..</code>. Empty sets
         * are omitted; the result is an empty string if all sets are empty.
         */
        @Override
        public String toString() {
            StringBuilder sb = new StringBuilder();
            appendSection(sb, "Cat: ", categories);
            appendSection(sb, "Pos: ", posTags);
            appendSection(sb, "Tags: ", tags);
            appendSection(sb, "Excluded: ", excludedPosTags);
            return sb.toString();
        }

        /**
         * Appends a single labelled, comma separated section to the builder.
         * Nothing is appended for empty sets. Sections are separated by
         * <code>" | "</code>.
         * @param sb the builder to append to
         * @param label the label (including the trailing separator) of the section
         * @param values the values of the section
         */
        private static void appendSection(StringBuilder sb, String label, Set<?> values){
            if(values.isEmpty()){
                return;
            }
            if(sb.length() > 0){
                sb.append(" | ");
            }
            sb.append(label);
            boolean first = true;
            for(Object value : values){
                if(first){
                    first = false;
                } else {
                    sb.append(", ");
                }
                if(value instanceof Enum){
                    //use name() as enums may override toString()
                    sb.append(((Enum<?>)value).name());
                } else {
                    sb.append(value);
                }
            }
        }
    }
}