/*
* Copyright 2013
* Ubiquitous Knowledge Processing (UKP) Lab and FG Language Technology
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.api.io;
import java.util.List;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationFS;
import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider;
/**
* Creates Chunk annotations from IOB encoded data.
* For example, the sequence (B-NP I-NP) will be converted into a NP-chunk
* annotation spanning two tokens.
*
*
*/
public class IobDecoder
{
private CAS cas;
private Feature chunkValue;
private MappingProvider mappingProvider;
private boolean internTags = true;
private String openChunk;
private int start;
private int end;
public IobDecoder(CAS aCas, Feature aChunkValue, MappingProvider aMappingProvider)
{
super();
cas = aCas;
chunkValue = aChunkValue;
mappingProvider = aMappingProvider;
}
public void setInternTags(boolean aInternTags)
{
internTags = aInternTags;
}
public void decode(List<? extends AnnotationFS> aTokens, String[] aChunkTags)
{
int i = 0;
for (AnnotationFS token : aTokens) {
// System.out.printf("%s %s %n", token.getCoveredText(), aChunkTags[i]);
String fields[] = aChunkTags[i].split("-");
String flag = fields.length == 2 ? fields[0] : "NONE";
String chunk = fields.length == 2 ? fields[1] : null;
// Start of a new hunk
if (chunk == null || !chunk.equals(openChunk) || "B".equals(flag)) {
if (openChunk != null) {
// End of previous chunk
chunkComplete();
}
if ("O".equals(flag)) {
openChunk = null;
}
else {
openChunk = chunk;
}
start = token.getBegin();
}
// Record how much of the chunk we have seen so far
end = token.getEnd();
i++;
}
// End of processing signal
chunkComplete();
}
private void chunkComplete()
{
if (openChunk != null) {
Type chunkType = mappingProvider.getTagType(openChunk);
AnnotationFS chunk = cas.createAnnotation(chunkType, start, end);
chunk.setStringValue(chunkValue, internTags ? openChunk.intern() :
openChunk);
cas.addFsToIndexes(chunk);
openChunk = null;
}
}
}