/*
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package de.tudarmstadt.ukp.dkpro.core.io.lif.internal;

import static org.apache.uima.fit.util.JCasUtil.select;
import static org.apache.uima.fit.util.JCasUtil.selectCovered;

import java.util.LinkedHashSet;
import java.util.Set;
import java.util.TreeSet;

import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.TOP;
import org.lappsgrid.discriminator.Discriminators;
import org.lappsgrid.serialization.lif.Annotation;
import org.lappsgrid.serialization.lif.Container;
import org.lappsgrid.serialization.lif.View;
import org.lappsgrid.vocabulary.Features;

import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Paragraph;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT;
import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.dependency.Dependency;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.objects.Object2IntOpenHashMap;

/**
 * Copies the DKPro Core annotations found in a {@link JCas} into a new {@link View} of a LIF
 * {@link Container}: paragraphs, sentences, tokens (with POS and lemma), named entities,
 * dependencies and constituent structures.
 * <p>
 * The instance keeps mutable ID bookkeeping which is reset at the start of every
 * {@link #convert(JCas, Container)} call. No synchronization is performed.
 */
public class DKPro2Lif
{
    private static final String PHRASE_STRUCTURE = "phrasestruct";
    private static final String CONSTITUENT = "const";
    private static final String DEPENDENCY_STRUCTURE = "depstruct";
    private static final String DEPENDENCY = "dep";
    private static final String PARAGRAPH = "para";
    private static final String SENTENCE = "sent";
    private static final String TOKEN = "tok";
    private static final String NAMED_ENTITY = "ne";

    /** Next free number per ID prefix. */
    private final Object2IntOpenHashMap<String> counters = new Object2IntOpenHashMap<>();

    /** Number already handed out for a feature structure, keyed by its address. */
    private final Int2IntOpenHashMap ids = new Int2IntOpenHashMap();

    /**
     * Sets language and text on the container and fills one newly created view with the
     * converted annotations.
     *
     * @param aJCas
     *            the CAS to read the document text, language and annotations from.
     * @param container
     *            the LIF container to write to.
     */
    public void convert(JCas aJCas, Container container)
    {
        // The bookkeeping is keyed by feature structure address. Entries left over from an
        // earlier call would be matched against unrelated feature structures of another CAS and
        // could produce duplicate IDs, so every conversion starts from a clean state.
        counters.clear();
        ids.clear();

        container.setLanguage(aJCas.getDocumentLanguage());
        container.setText(aJCas.getDocumentText());

        View view = container.newView();

        // The order of these calls determines the order of the annotations in the view.
        convertParagraphs(aJCas, view);
        convertSentences(aJCas, view);
        convertTokens(aJCas, view);
        convertNamedEntities(aJCas, view);
        convertDependencies(aJCas, view);
        convertPhraseStructures(aJCas, view);
    }

    /** Emits one paragraph annotation per {@link Paragraph}. */
    private void convertParagraphs(JCas aJCas, View aView)
    {
        for (Paragraph p : select(aJCas, Paragraph.class)) {
            aView.newAnnotation(id(PARAGRAPH, p), Discriminators.Uri.PARAGRAPH, p.getBegin(),
                    p.getEnd());
        }
    }

    /** Emits one sentence annotation per {@link Sentence}. */
    private void convertSentences(JCas aJCas, View aView)
    {
        for (Sentence s : select(aJCas, Sentence.class)) {
            aView.newAnnotation(id(SENTENCE, s), Discriminators.Uri.SENTENCE, s.getBegin(),
                    s.getEnd());
        }
    }

    /** Emits one token annotation per {@link Token}, with POS and lemma features if present. */
    private void convertTokens(JCas aJCas, View aView)
    {
        for (Token t : select(aJCas, Token.class)) {
            Annotation a = aView.newAnnotation(id(TOKEN, t), Discriminators.Uri.TOKEN,
                    t.getBegin(), t.getEnd());

            if (t.getPos() != null) {
                a.addFeature(Features.Token.POS, t.getPos().getPosValue());
            }

            if (t.getLemma() != null) {
                a.addFeature(Features.Token.LEMMA, t.getLemma().getValue());
            }
        }
    }

    /** Emits one named entity annotation per {@link NamedEntity}, labelled with its value. */
    private void convertNamedEntities(JCas aJCas, View aView)
    {
        for (NamedEntity neAnno : select(aJCas, NamedEntity.class)) {
            Annotation ne = aView.newAnnotation(id(NAMED_ENTITY, neAnno), Discriminators.Uri.NE,
                    neAnno.getBegin(), neAnno.getEnd());
            ne.setLabel(neAnno.getValue());
        }
    }

    /**
     * Emits the dependency relations sentence by sentence. A sentence that covers at least one
     * dependency additionally gets a dependency structure annotation listing the relation IDs.
     */
    private void convertDependencies(JCas aJCas, View aView)
    {
        for (Sentence s : select(aJCas, Sentence.class)) {
            Set<String> depRelIds = new TreeSet<>();

            for (Dependency dep : selectCovered(Dependency.class, s)) {
                String depRelId = id(DEPENDENCY, dep);
                // LAPPS dependencies inherit from Relation which has no offsets
                Annotation depRel = aView.newAnnotation(depRelId,
                        Discriminators.Uri.DEPENDENCY);
                depRel.setLabel(dep.getDependencyType());
                depRel.addFeature(Features.Dependency.GOVERNOR, id(TOKEN, dep.getGovernor()));
                depRel.addFeature(Features.Dependency.DEPENDENT, id(TOKEN, dep.getDependent()));
                depRelIds.add(depRelId);
            }

            if (!depRelIds.isEmpty()) {
                Annotation depStruct = aView.newAnnotation(id(DEPENDENCY_STRUCTURE, s),
                        Discriminators.Uri.DEPENDENCY_STRUCTURE, s.getBegin(), s.getEnd());
                depStruct.addFeature(Features.DependencyStructure.DEPENDENCIES, depRelIds);
            }
        }
    }

    /**
     * Emits one phrase structure annotation per {@link ROOT}. The constituents of the tree are
     * emitted before the phrase structure annotation that lists them.
     */
    private void convertPhraseStructures(JCas aJCas, View aView)
    {
        for (ROOT r : select(aJCas, ROOT.class)) {
            // Insertion-ordered so that the IDs are listed in traversal order.
            Set<String> constituents = new LinkedHashSet<>();
            convertConstituent(aView, r, constituents);

            Annotation phraseStruct = aView.newAnnotation(id(PHRASE_STRUCTURE, r),
                    Discriminators.Uri.PHRASE_STRUCTURE, r.getBegin(), r.getEnd());
            phraseStruct.addFeature(Features.PhraseStructure.CONSTITUENTS, constituents);
        }
    }

    /**
     * Walks the tree below the given node depth-first (parent before children). Every
     * {@link Constituent} is emitted as a constituent annotation; for every {@link Token} only
     * its ID is recorded.
     *
     * @param aView
     *            the view receiving the constituent annotations.
     * @param aNode
     *            the node to convert.
     * @param aConstituents
     *            collects the IDs of all visited constituents and tokens.
     * @throws IllegalStateException
     *             if a node is neither a {@link Constituent} nor a {@link Token}.
     */
    private void convertConstituent(View aView, org.apache.uima.jcas.tcas.Annotation aNode,
            Set<String> aConstituents)
    {
        if (aNode instanceof Constituent) {
            Constituent node = (Constituent) aNode;

            // LAPPS constituents inherit from Relation which has no offsets
            Annotation constituent = aView.newAnnotation(id(CONSTITUENT, node),
                    Discriminators.Uri.CONSTITUENT);
            // Carry over the category, as is done for dependencies and named entities; without
            // it the emitted constituent has no type information at all.
            constituent.setLabel(node.getConstituentType());
            aConstituents.add(constituent.getId());

            // NOTE(review): the parent/child links between constituents are not exported here;
            // only the flat list of IDs ends up in the phrase structure annotation.
            for (org.apache.uima.jcas.tcas.Annotation child : select(node.getChildren(),
                    org.apache.uima.jcas.tcas.Annotation.class)) {
                convertConstituent(aView, child, aConstituents);
            }
        }
        else if (aNode instanceof Token) {
            aConstituents.add(id(TOKEN, aNode));
        }
        else {
            throw new IllegalStateException("Unexpected node type: " + aNode);
        }
    }

    /**
     * Returns the ID {@code <prefix>-<number>} for the given feature structure.
     * <p>
     * The number is taken from the counter of the prefix used the first time the feature
     * structure is seen and is remembered by address. Later requests for the same feature
     * structure reuse that number even under a different prefix, e.g. the dependency structure
     * of a sentence carries the same number as the sentence itself.
     *
     * @param aPrefix
     *            the ID prefix.
     * @param aFS
     *            the feature structure to identify.
     * @return the ID.
     */
    private String id(String aPrefix, TOP aFS)
    {
        int address = aFS.getAddress();

        int id;
        // if we already have an ID for the given FS return it
        if (ids.containsKey(address)) {
            id = ids.get(address);
        }
        // otherwise generate a new ID
        else {
            // yields the map's default return value (0 unless reconfigured) for a new prefix
            id = counters.getInt(aPrefix);
            ids.put(address, id);
            counters.put(aPrefix, id + 1);
        }

        return aPrefix + '-' + id;
    }
}