/*
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.lif.internal;
import static org.apache.commons.lang.StringUtils.isEmpty;
import static org.apache.commons.lang.StringUtils.isNotEmpty;
import static org.apache.uima.fit.util.JCasUtil.select;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import org.apache.uima.fit.util.FSCollectionFactory;
import org.apache.uima.jcas.JCas;
import org.lappsgrid.discriminator.Discriminators;
import org.lappsgrid.serialization.lif.Annotation;
import org.lappsgrid.serialization.lif.Container;
import org.lappsgrid.serialization.lif.View;
import org.lappsgrid.vocabulary.Features;
import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS;
import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.ROOT;
/**
 * Converts annotations from the LAPPS Grid LIF format ({@link Container}) into DKPro Core
 * annotations on a {@link JCas}. Supported layers: paragraphs, sentences, tokens (including
 * POS and lemma features), named entities, dependencies, and phrase structures (constituents).
 */
public class Lif2DKPro
{
    /**
     * Convert the given LIF container into DKPro Core annotations.
     *
     * @param aContainer the LIF container to read from.
     * @param aJCas the target CAS; its document text and language are overwritten.
     */
    public void convert(Container aContainer, JCas aJCas)
    {
        aJCas.setDocumentLanguage(aContainer.getLanguage());
        aJCas.setDocumentText(aContainer.getText());

        // NOTE(review): only the first view is converted - further views (if any) are ignored.
        View view = aContainer.getView(0);

        convertParagraphs(view, aJCas);
        convertSentences(view, aJCas);

        // Tokens must be converted first because dependencies and constituents refer to them
        // by their LIF IDs.
        Map<String, Token> tokenIdx = convertTokens(view, aJCas);

        convertNamedEntities(view, aJCas);
        convertDependencies(view, tokenIdx, aJCas);
        convertConstituents(view, tokenIdx, aJCas);
    }

    /** Create a DKPro {@link Paragraph} for every LIF paragraph annotation. */
    private void convertParagraphs(View aView, JCas aJCas)
    {
        aView.getAnnotations().stream()
                .filter(a -> Discriminators.Uri.PARAGRAPH.equals(a.getAtType()))
                .forEach(para -> {
                    Paragraph paraAnno = new Paragraph(aJCas, para.getStart().intValue(),
                            para.getEnd().intValue());
                    paraAnno.addToIndexes();
                });
    }

    /** Create a DKPro {@link Sentence} for every LIF sentence annotation. */
    private void convertSentences(View aView, JCas aJCas)
    {
        aView.getAnnotations().stream()
                .filter(a -> Discriminators.Uri.SENTENCE.equals(a.getAtType()))
                .forEach(sent -> {
                    Sentence sentAnno = new Sentence(aJCas, sent.getStart().intValue(),
                            sent.getEnd().intValue());
                    sentAnno.addToIndexes();
                });
    }

    /**
     * Create DKPro {@link Token}s, attaching {@link POS} and {@link Lemma} annotations where
     * the LIF token carries the corresponding features.
     *
     * @return a map from LIF token ID to the created Token, used to resolve token references
     *         from dependencies and constituents.
     */
    private Map<String, Token> convertTokens(View aView, JCas aJCas)
    {
        Map<String, Token> tokenIdx = new HashMap<>();
        aView.getAnnotations().stream()
                .filter(a -> Discriminators.Uri.TOKEN.equals(a.getAtType()))
                .forEach(token -> {
                    Token tokenAnno = new Token(aJCas, token.getStart().intValue(),
                            token.getEnd().intValue());
                    String pos = token.getFeature(Features.Token.POS);
                    String lemma = token.getFeature(Features.Token.LEMMA);
                    if (isNotEmpty(pos)) {
                        POS posAnno = new POS(aJCas, tokenAnno.getBegin(), tokenAnno.getEnd());
                        posAnno.setPosValue(pos.intern());
                        // The coarse value is derived from the UIMA type name; the generic POS
                        // type itself carries no specific coarse category.
                        posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null
                                : posAnno.getType().getShortName().intern());
                        posAnno.addToIndexes();
                        tokenAnno.setPos(posAnno);
                    }
                    if (isNotEmpty(lemma)) {
                        Lemma lemmaAnno = new Lemma(aJCas, tokenAnno.getBegin(),
                                tokenAnno.getEnd());
                        lemmaAnno.setValue(lemma);
                        lemmaAnno.addToIndexes();
                        tokenAnno.setLemma(lemmaAnno);
                    }
                    tokenAnno.addToIndexes();
                    tokenIdx.put(token.getId(), tokenAnno);
                });
        return tokenIdx;
    }

    /**
     * Create a DKPro {@link NamedEntity} for every LIF NE annotation; the LIF label becomes
     * the NE value.
     */
    private void convertNamedEntities(View aView, JCas aJCas)
    {
        aView.getAnnotations().stream()
                .filter(a -> Discriminators.Uri.NE.equals(a.getAtType()))
                .forEach(ne -> {
                    NamedEntity neAnno = new NamedEntity(aJCas, ne.getStart().intValue(),
                            ne.getEnd().intValue());
                    neAnno.setValue(ne.getLabel());
                    neAnno.addToIndexes();
                });
    }

    /**
     * Create DKPro {@link Dependency} relations. A dependency without a governor or whose
     * governor equals its dependent is converted to a {@link ROOT} relation whose governor is
     * the dependent token itself (DKPro Core convention).
     */
    private void convertDependencies(View aView, Map<String, Token> aTokenIdx, JCas aJCas)
    {
        aView.getAnnotations().stream()
                .filter(a -> Discriminators.Uri.DEPENDENCY.equals(a.getAtType()))
                .forEach(dep -> {
                    String dependent = dep.getFeature(Features.Dependency.DEPENDENT);
                    String governor = dep.getFeature(Features.Dependency.GOVERNOR);
                    Dependency depAnno;
                    if (isEmpty(governor) || governor.equals(dependent)) {
                        // Root relation: governor points back at the dependent token.
                        depAnno = new ROOT(aJCas);
                        depAnno.setGovernor(aTokenIdx.get(dependent));
                    }
                    else {
                        depAnno = new Dependency(aJCas);
                        depAnno.setGovernor(aTokenIdx.get(governor));
                    }
                    depAnno.setDependencyType(dep.getLabel());
                    depAnno.setDependent(aTokenIdx.get(dependent));
                    // The dependency annotation spans the dependent token.
                    depAnno.setBegin(depAnno.getDependent().getBegin());
                    depAnno.setEnd(depAnno.getDependent().getEnd());
                    depAnno.addToIndexes();
                });
    }

    /**
     * Build DKPro constituent trees from LIF phrase-structure annotations. Offsets missing in
     * the LIF data are derived from the leaves via {@link #percolateOffsets}.
     */
    private void convertConstituents(View aView, Map<String, Token> aTokenIdx, JCas aJCas)
    {
        aView.getAnnotations().stream()
                .filter(a -> Discriminators.Uri.PHRASE_STRUCTURE.equals(a.getAtType()))
                .forEach(ps -> {
                    String rootId = findRoot(aView, ps);
                    // IDs of all nodes (constituents and tokens) in this phrase structure
                    Set<String> constituentIDs = new HashSet<>(
                            getSetFeature(ps, Features.PhraseStructure.CONSTITUENTS));
                    List<Annotation> constituents = new ArrayList<>();
                    Map<String, Constituent> constituentIdx = new HashMap<>();
                    // Instantiate all the constituents
                    aView.getAnnotations().stream()
                            .filter(a -> constituentIDs.contains(a.getId()))
                            .forEach(con -> {
                                if (Discriminators.Uri.CONSTITUENT.equals(con.getAtType())) {
                                    Constituent conAnno;
                                    if (rootId.equals(con.getId())) {
                                        conAnno = new de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT(
                                                aJCas);
                                    }
                                    else {
                                        conAnno = new Constituent(aJCas);
                                    }
                                    // Offsets are optional in LIF; missing ones are filled in
                                    // later by percolateOffsets().
                                    if (con.getStart() != null) {
                                        conAnno.setBegin(con.getStart().intValue());
                                    }
                                    if (con.getEnd() != null) {
                                        conAnno.setEnd(con.getEnd().intValue());
                                    }
                                    conAnno.setConstituentType(con.getLabel());
                                    constituentIdx.put(con.getId(), conAnno);
                                    constituents.add(con);
                                }
                                // If it is not a constituent, it must be a token ID - tokens
                                // were already created and recorded in the token index.
                            });
                    // Set parent and children features
                    constituents.forEach(
                            con -> linkChildren(con, constituentIdx, aTokenIdx, aJCas));
                    // Percolate offsets - they might not have been set on the constituents!
                    percolateOffsets(constituentIdx.get(rootId));
                    // Add to indexes only after the tree is fully wired up
                    constituentIdx.values().forEach(Constituent::addToIndexes);
                });
    }

    /**
     * Attach the children (constituents or tokens) of the given LIF constituent to its CAS
     * counterpart and set the parent links on the children.
     *
     * @throws IllegalStateException if a child ID resolves to neither a constituent nor a
     *             token, or ambiguously to both.
     */
    private void linkChildren(Annotation aCon, Map<String, Constituent> aConstituentIdx,
            Map<String, Token> aTokenIdx, JCas aJCas)
    {
        Constituent conAnno = aConstituentIdx.get(aCon.getId());
        Set<String> childIDs = getSetFeature(aCon, Features.Constituent.CHILDREN);
        List<org.apache.uima.jcas.tcas.Annotation> children = new ArrayList<>();
        childIDs.forEach(childID -> {
            Constituent conChild = aConstituentIdx.get(childID);
            Token tokenChild = aTokenIdx.get(childID);
            if (conChild != null && tokenChild == null) {
                conChild.setParent(conAnno);
                children.add(conChild);
            }
            else if (conChild == null && tokenChild != null) {
                tokenChild.setParent(conAnno);
                children.add(tokenChild);
            }
            else if (conChild == null && tokenChild == null) {
                // BUGFIX: report the unresolvable child ID, not the parent constituent's ID.
                throw new IllegalStateException("ID [" + childID + "] not found");
            }
            else {
                // BUGFIX: the ambiguous ID is the child's, not the parent constituent's.
                throw new IllegalStateException("ID [" + childID
                        + "] is constituent AND token? Impossible!");
            }
        });
        conAnno.setChildren(FSCollectionFactory.createFSArray(aJCas, children));
    }

    /** Typed access to a set-valued LIF feature. */
    @SuppressWarnings("unchecked")
    private <T> Set<T> getSetFeature(Annotation aAnnotation, String aName)
    {
        return aAnnotation.getFeatureSet(aName);
    }

    /**
     * Recursively derive constituent offsets from their children. Offsets that were already
     * set from the LIF data (begin/end != 0) are asserted to agree with the derived values.
     */
    private void percolateOffsets(org.apache.uima.jcas.tcas.Annotation aNode)
    {
        if (aNode instanceof Constituent) {
            Constituent conAnno = (Constituent) aNode;
            int begin = Integer.MAX_VALUE;
            int end = 0;
            for (org.apache.uima.jcas.tcas.Annotation a : select(conAnno.getChildren(),
                    org.apache.uima.jcas.tcas.Annotation.class)) {
                percolateOffsets(a);
                begin = Math.min(a.getBegin(), begin);
                end = Math.max(a.getEnd(), end);
            }
            // begin/end == 0 is treated as "not set" (LIF offsets are optional).
            if (aNode.getBegin() != 0) {
                assert begin == aNode.getBegin();
            }
            else {
                aNode.setBegin(begin);
            }
            if (aNode.getEnd() != 0) {
                assert end == aNode.getEnd();
            }
            else {
                aNode.setEnd(end);
            }
        }
    }

    /**
     * Determine the root of a phrase structure: the single constituent within the structure
     * that is not a child of any other constituent.
     */
    private String findRoot(View aView, Annotation aPS)
    {
        // Get all the constituents in the phrase structure
        Set<String> constituents = new HashSet<>(
                getSetFeature(aPS, Features.PhraseStructure.CONSTITUENTS));
        List<Annotation> psConstituents = aView.getAnnotations().stream()
                .filter(a -> Discriminators.Uri.CONSTITUENT.equals(a.getAtType()))
                .filter(con -> constituents.contains(con.getId()))
                .collect(Collectors.toList());
        // Remove all constituents that are children of other constituents within the PS
        psConstituents.forEach(con -> {
            Set<String> children = getSetFeature(con, Features.Constituent.CHILDREN);
            children.forEach(constituents::remove);
        });
        // If all went well, only one constituent should be left and that is the root constituent
        assert 1 == constituents.size();
        // Return the ID of the root constituent
        return constituents.iterator().next();
    }
}