/* * Copyright 2014 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package de.tudarmstadt.ukp.dkpro.core.io.penntree; import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.trim; import static de.tudarmstadt.ukp.dkpro.core.io.penntree.PennTreeUtils.unescapeToken; import static org.apache.commons.lang.StringUtils.isBlank; import static org.apache.uima.fit.util.FSCollectionFactory.createFSArray; import static org.apache.uima.fit.util.JCasUtil.selectCovered; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.uima.cas.CASException; import org.apache.uima.cas.Type; import org.apache.uima.jcas.JCas; import org.apache.uima.jcas.tcas.Annotation; import de.tudarmstadt.ukp.dkpro.core.api.lexmorph.type.pos.POS; import de.tudarmstadt.ukp.dkpro.core.api.resources.MappingProvider; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.Constituent; import de.tudarmstadt.ukp.dkpro.core.api.syntax.type.constituent.ROOT; public class PennTreeToJCasConverter { private static final String ROOT = "ROOT"; private static final String NONE = "-NONE-"; private boolean writeTracesToText; private boolean createPosTags; private boolean internTags; private String rootLabel = ROOT; private MappingProvider posMappingProvider; private MappingProvider constituentMappingProvider; public PennTreeToJCasConverter(MappingProvider aPosMappingProvider, MappingProvider aConstituentMappingProvider) { posMappingProvider = aPosMappingProvider; constituentMappingProvider = aConstituentMappingProvider; } public boolean isWriteTracesToText() { return writeTracesToText; } public void setWriteTracesToText(boolean aWriteTracesToText) { writeTracesToText = aWriteTracesToText; } public boolean isCreatePosTags() { return createPosTags; } public void setCreatePosTags(boolean aCreatePosTags) { createPosTags = aCreatePosTags; } public boolean isInternTags() { return internTags; } public void setInternTags(boolean aInternTags) { internTags = aInternTags; } public String getRootLabel() { return rootLabel; } public void setRootLabel(String aRootLabel) { rootLabel = aRootLabel; } public Constituent convertPennTree(JCas aJCas, StringBuilder aText, PennTreeNode aNode) { return convertPennTree(aJCas, aText, aNode, null, true); } private Constituent convertPennTree(JCas aJCas, StringBuilder aText, PennTreeNode aNode, Constituent aParent, boolean aBOS) { boolean bos = aBOS; Constituent constituent = null; Constituent parent = aParent; boolean generatedParent = false; // Do we need to insert an artificial ROOT node? if (aParent == null) { // Case 2: no root node: (S if (!rootLabel.equals(aNode.getLabel()) && !isBlank(aNode.getLabel())) { constituent = createConstituent(aJCas, aNode.getLabel()); parent = new ROOT(aJCas); parent.setConstituentType(ROOT); parent.setChildren(createFSArray(aJCas, new Constituent[] { constituent })); generatedParent = true; } // Case 1: unlabeled root node: ( (S... // Case 3: labeled root node: (ROOT (S... else { constituent = new ROOT(aJCas); constituent.setConstituentType(ROOT); } } else { constituent = createConstituent(aJCas, aNode.getLabel()); } constituent.setBegin(aText.length()); List<Annotation> children = new ArrayList<Annotation>(); for (PennTreeNode c : aNode.getChildren()) { if (c.isPreTerminal()) { // Do not read traces into the CAS, at least not as tokens if (!writeTracesToText && NONE.equals(c.getLabel())) { continue; } // Add space between tokens with inside sentence. Do not add token at the beginning // of the sentence, even if we append into a larger document. if (!bos) { aText.append(' '); } // Add to the document test int begin = aText.length(); aText.append(unescapeToken(c.getChildren().get(0).getLabel())); int end = aText.length(); Token token = new Token(aJCas, begin, end); // only add POS to index if we want POS-tagging if (isCreatePosTags()) { token.setPos(createPOS(aJCas, c, begin, end)); } token.setParent(constituent); token.addToIndexes(); children.add(token); } else { children.add(convertPennTree(aJCas, aText, c, constituent, bos)); } bos = false; } constituent.setEnd(aText.length()); int[] offsets = {constituent.getBegin(), constituent.getEnd()}; trim(aText, offsets); constituent.setBegin(offsets[0]); constituent.setEnd(offsets[1]); constituent.setChildren(createFSArray(aJCas, children)); constituent.setParent(parent); constituent.addToIndexes(); // We we created an additional ROOT node, then we need to set its offsets as well if (generatedParent) { parent.setBegin(constituent.getBegin()); parent.setEnd(constituent.getEnd()); parent.addToIndexes(); } return constituent; } public Constituent convertPennTree(Sentence aSentence, PennTreeNode aNode) { JCas jcas; try { jcas = aSentence.getCAS().getJCas(); } catch (CASException e) { throw new IllegalStateException(e); } List<Token> tokens = selectCovered(Token.class, aSentence); List<PennTreeNode> preTerminalNodes = PennTreeUtils.getPreTerminals(aNode); Map<PennTreeNode, Token> tokenMap = new HashMap<>(); for (int i = 0; i < tokens.size(); i++) { tokenMap.put(preTerminalNodes.get(i), tokens.get(i)); } return convertPennTree(jcas, aNode, null, tokenMap); } private Constituent convertPennTree(JCas aJCas, PennTreeNode aNode, Constituent aParent, Map<PennTreeNode, Token> aTokenMap) { Constituent constituent = null; Constituent parent = aParent; boolean generatedParent = false; // Do we need to insert an artificial ROOT node? if (aParent == null) { // Case 2: no root node: (S if (!rootLabel.equals(aNode.getLabel()) && !isBlank(aNode.getLabel())) { constituent = createConstituent(aJCas, aNode.getLabel()); parent = new ROOT(aJCas); parent.setConstituentType(ROOT); parent.setChildren(createFSArray(aJCas, new Constituent[] { constituent })); generatedParent = true; } // Case 1: unlabeled root node: ( (S... // Case 3: labeled root node: (ROOT (S... else { constituent = new ROOT(aJCas); constituent.setConstituentType(ROOT); } } else { constituent = createConstituent(aJCas, aNode.getLabel()); } List<Annotation> children = new ArrayList<Annotation>(); for (PennTreeNode c : aNode.getChildren()) { if (c.isPreTerminal()) { Token token = aTokenMap.get(c); token.setParent(constituent); // only add POS to index if we want POS-tagging if (isCreatePosTags()) { token.setPos(createPOS(aJCas, c, token.getBegin(), token.getEnd())); } children.add(token); } else { children.add(convertPennTree(aJCas, c, constituent, aTokenMap)); } } constituent.setBegin(children.get(0).getBegin()); constituent.setEnd(children.get(children.size()-1).getEnd()); constituent.setChildren(createFSArray(aJCas, children)); constituent.setParent(parent); constituent.addToIndexes(); // We we created an additional ROOT node, then we need to set its offsets as well if (generatedParent) { parent.setBegin(constituent.getBegin()); parent.setEnd(constituent.getEnd()); parent.addToIndexes(); } return constituent; } private POS createPOS(JCas aJCas, PennTreeNode aPreterminal, int aBegin, int aEnd) { POS posAnno; if (posMappingProvider != null) { Type posTag = posMappingProvider.getTagType(aPreterminal.getLabel()); posAnno = (POS) aJCas.getCas().createAnnotation(posTag, aBegin, aEnd); } else { posAnno = new POS(aJCas, aBegin, aEnd); } posAnno.setPosValue(internTags ? aPreterminal.getLabel().intern() : aPreterminal .getLabel()); posAnno.setCoarseValue(posAnno.getClass().equals(POS.class) ? null : posAnno.getType().getShortName().intern()); posAnno.addToIndexes(); return posAnno; } private Constituent createConstituent(JCas aJCas, String aLabel) { if (NONE.equals(aLabel)) { return new Constituent(aJCas); } String[] label = aLabel.split("-"); Constituent constituentAnno; if (constituentMappingProvider != null) { Type constituentTag = constituentMappingProvider.getTagType(label[0]); // We just set a dummy value for the offsets here. These need to be fixed when we know the // children and before addToIndexes() is called. constituentAnno = (Constituent) aJCas.getCas().createAnnotation(constituentTag, 0, 0); } else { constituentAnno = new Constituent(aJCas, 0, 0); } constituentAnno.setConstituentType(label[0]); if (label.length >= 2) { constituentAnno.setSyntacticFunction(label[1]); } return constituentAnno; } }