/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.stanbol.commons.opennlp;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import opennlp.tools.util.Span;
/**
* Simple version of a {@link opennlp.tools.chunker.Chunker} that uses the POS tags to build chunks.
* It does not implement the {@link opennlp.tools.chunker.Chunker} interface because implementing
* methods other than the {@link opennlp.tools.chunker.Chunker#chunkAsSpans(String[], String[])}
* is not feasible.<p>
* Defaults are based on the <a href="http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html">
* Penn Treebank</a> tag set
*
* TODO: <ul>
* <li> Test if POS tags are the same for different languages
* <li> Check if it is possible to implement the {@link opennlp.tools.chunker.Chunker} interface
* </ul>
* @author Rupert Westenthaler
* @deprecated replaced by STANBOL-733 (stanbol nlp processing module
*
*/
public class PosTypeChunker {
private final double minPosProb;
private final Set<String> followTypes;
private final Set<String> buildTypes;
/**
* Creates an instance for the given language based on the configuration
* within the {@link PosTagsCollectionEnum}.
* @param lang The language
* @param minPosTagProbaility The minimum probability of a POS tag so that
* it is processed. In case of lower Probabilities POS tags are ignored and
* assumed to be matching.
* @return the instance or <code>null</code> if no configuration for the
* parsed language is present in the {@link PosTagsCollectionEnum}.
*/
public static PosTypeChunker getInstance(String lang,double minPosTagProbaility){
Set<String> nounPosTagCollection =
PosTagsCollectionEnum.getPosTagCollection(lang, PosTypeCollectionType.NOUN);
if(nounPosTagCollection != null && !nounPosTagCollection.isEmpty()){
return new PosTypeChunker(nounPosTagCollection,
PosTagsCollectionEnum.getPosTagCollection(
lang,PosTypeCollectionType.FOLLOW),minPosTagProbaility);
} else {
return null;
}
}
/**
* Initialise a new PosTypeChunker for the parsed POS tag collections. This
* Constructor can be used if no predefined Configuration for a given
* language is available in the {@link PosTagsCollectionEnum}<p>
* Note that buildPosTypes are added to the followed once. Therefore the
* followPosTypes may or may not include some/all buildPosTypes.
* @param buildPosTypes the POS types that trigger a new Chunk (MUST NOT be
* <code>null</code> nor {@link Set#isEmpty() empty}).
* @param followPosTypes additional POS types followed to extend Chunks (MAY
* BE <code>null</code> or empty).
*/
public PosTypeChunker(Set<String> buildPosTypes,Set<String> followPosTypes,double minPosProb){
if(buildPosTypes == null || buildPosTypes.isEmpty()){
throw new IllegalArgumentException("The set of POS types used to" +
"build Chunks MUST NOT be NULL nor empty!");
}
this.buildTypes = Collections.unmodifiableSet(new TreeSet<String>(buildPosTypes));
Set<String> follow = new TreeSet<String>();
follow.addAll(buildTypes);
if(followPosTypes != null){
follow.addAll(followPosTypes);
}
this.followTypes = Collections.unmodifiableSet(follow);
if(minPosProb > 1){
throw new IllegalArgumentException("The minimum POS tag probalility MUST BE set to a value [0..1] or values < 0 to deactivate this feature (parsed="+minPosProb+")!");
} else {
this.minPosProb = minPosProb;
}
}
/**
* @param props the probabilities of the pos tags or <code>null</code> if
* not available
* @param pos the POS tags
* @return <code>true</code> if follow
*/
private boolean followPOS(double[] props,String... pos){
boolean reject = false;
for(int i=0;i<pos.length;i++){
if(props == null || props[i] >= minPosProb){
if(followTypes.contains(pos[i])){
return true;
} else {
reject = true;
}
} //else prob to low ... do not process
}
//in case we have not found a POS tag with a prob > minPosProb
//return TRUE
return !reject;
}
private boolean includePOS(double[] props,String... pos){
boolean reject = false;
for(int i=0;i<pos.length;i++){
if(props == null || props[i] >= minPosProb){
if(buildTypes.contains(pos[i])){
return true;
} else {
reject = true;
}
}
}
//in case we have not found a POS tag with a prob > minPosProb
//return TRUE
return !reject;
}
/**
* The set of POS types followed to extend Chunks. This includes the
* {@link #getChunkPosTypes()} values
* @return the followTypes
*/
public final Set<String> getFollowedPosTypes() {
return followTypes;
}
/**
* The set of POS types used to create Chunks
* @return the buildTypes
*/
public final Set<String> getChunkPosTypes() {
return buildTypes;
}
/**
* Build the chunks based on the parsed tokens and POS tags. <p>
* This method is the equivalent to
* {@link opennlp.tools.chunker.Chunker#chunkAsSpans(String[], String[])}
* @param tokens the tokens
* @param tags the POS tags for the tokens
* @return the chunks as spans over the parsed tokens
*/
public Span[] chunkAsSpans(String[] tokens, String[] tags) {
int consumed = -1;
List<Span> chunks = new ArrayList<Span>();
for(int i=0;i<tokens.length;i++){
if(includePOS(null,tags[i])){
int start = i;
while(start-1 > consumed && followPOS(null,tags[start-1])){
start--; //follow backwards until consumed
}
int followEnd = i;
int end = i;
while(followEnd+1 < tokens.length && followPOS(null,tags[followEnd+1])){
followEnd++; //follow
if(includePOS(null,tags[followEnd])){
end = followEnd; //extend end only if act is include
}
}
chunks.add(new Span(start,end));
// consumed = end;
i = followEnd;
}//build no chunk for this token
}
return chunks.toArray(new Span[chunks.size()]);
}
/**
* Build the chunks based on the parsed tokens and the one or more detected
* POS tags alternatives for the tokens. <p>
* @param tokens the tokens
* @param tags the POS tags for the tokens (1D:tokens; 2D:POS tags)
* @return the chunks as spans over the parsed tokens
*/
public Span[] chunkAsSpans(String[] tokens, String[][] tags,double[][]props) {
//NOTE: this is a 1:1 copy of the above method!! However this is the
// only solution, because merging them into a single one would
// need to copy the Stirng[] of the other into a String[][1] as
// used by this one :(
// If someone has a better Idea feel free to change!
// Rupert Westenthaler (28.Sep.2011)
int consumed = -1;
List<Span> chunks = new ArrayList<Span>();
for(int i=0;i<tokens.length;i++){
if(includePOS(props[i],tags[i])){
int start = i;
//do not follow backwards!
while(start-1 > consumed && followPOS(props[start-1],tags[start-1])){
start--; //follow backwards until consumed
}
int followEnd = i;
int end = i;
while(followEnd+1 < tokens.length && followPOS(props[followEnd+1],tags[followEnd+1])){
followEnd++; //follow
if(includePOS(props[followEnd],tags[followEnd])){
end = followEnd; //extend end only if act is include
}
}
chunks.add(new Span(start,end));
// consumed = end;
i = followEnd;
}//build no chunk for this token
}
return chunks.toArray(new Span[chunks.size()]);
}
}