/******************************************************************************* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.uby.resource; import static de.tudarmstadt.ukp.uby.resource.UbyResourceUtils.corePosToUbyPos; import static de.tudarmstadt.ukp.uby.resource.UbyResourceUtils.getMostFrequentSense; import java.util.ArrayList; import java.util.List; import java.util.Map; import org.apache.commons.lang.StringUtils; import org.apache.uima.fit.component.Resource_ImplBase; import org.apache.uima.fit.descriptor.ConfigurationParameter; import org.apache.uima.fit.descriptor.ExternalResource; import org.apache.uima.resource.ResourceAccessException; import org.apache.uima.resource.ResourceInitializationException; import org.apache.uima.resource.ResourceSpecifier; import de.tudarmstadt.ukp.dkpro.core.api.parameter.ComponentParameters; import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; import de.tudarmstadt.ukp.dkpro.core.dictionaryannotator.semantictagging.SemanticTagProvider; import de.tudarmstadt.ukp.lmf.api.Uby; import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry; import de.tudarmstadt.ukp.lmf.model.core.Lexicon; import de.tudarmstadt.ukp.lmf.model.core.Sense; import de.tudarmstadt.ukp.lmf.model.enums.ELabelTypeSemantics; import 
de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.meta.SemanticLabel;

/**
 * This shared resource can be added as ExternalResource in Analysis Engines
 * that annotate common nouns, main verbs and adjectives
 * with semantic field information from WordNet (for English text)
 * or GermaNet (for German text).
 *
 * <p>
 * The wordnet lexicon is resolved on every call from the document language of the CAS and,
 * as a fallback, from the optional {@link #PARAM_LANGUAGE} parameter. No per-call state is
 * kept in fields. For any other language the tag {@code "UNKNOWN"} is returned.
 * </p>
 *
 * @author Judith Eckle-Kohler
 */
public class UbySemanticFieldResource
    extends Resource_ImplBase
    implements SemanticTagProvider
{
    public static final String RES_UBY = "uby";
    @ExternalResource(key = RES_UBY)
    private Uby uby;

    public static final String PARAM_LANGUAGE = ComponentParameters.PARAM_LANGUAGE;
    @ConfigurationParameter(name = PARAM_LANGUAGE, mandatory = false)
    protected String language;

    /** Tag returned whenever no semantic field can be determined. */
    private static final String UNKNOWN = "UNKNOWN";

    /** Name of the English wordnet lexicon in UBY. */
    private static final String WORDNET = "WordNet";

    /** Name of the German wordnet lexicon in UBY. */
    private static final String GERMANET = "GermaNet";

    @Override
    public boolean initialize(ResourceSpecifier aSpecifier, Map aAdditionalParams)
        throws ResourceInitializationException
    {
        // No initialization beyond what the superclass does.
        return super.initialize(aSpecifier, aAdditionalParams);
    }

    /**
     * Returns the semantic field of a single token.
     *
     * @param token
     *            the token to tag; its POS and lemma annotations are read
     * @return the semantic field, or {@code "UNKNOWN"} if the language is neither English nor
     *         German, the POS has no UBY counterpart, or no lexicon contains the lemma
     * @throws ResourceAccessException
     *             if any exception occurs while accessing the token or the UBY resource
     */
    @Override
    public String getSemanticTag(Token token)
        throws ResourceAccessException
    {
        try {
            String lexiconName = resolveWordnetName(token.getCAS().getDocumentLanguage());
            if (lexiconName == null) {
                return UNKNOWN;
            }
            Lexicon wordnet = uby.getLexiconByName(lexiconName);

            // does the token have a POS which has relevant information in the lexicon?
            EPartOfSpeech[] ubyPosTags = corePosToUbyPos(token.getPos().getType().getShortName());
            if (ubyPosTags.length == 0) {
                return UNKNOWN;
            }

            // is the lemma contained in any of the UBY lexicons?
            String lemma = token.getLemma().getValue();
            List<LexicalEntry> entriesInAnyLexicon = uby.getLexicalEntries(lemma, null, null);
            if (entriesInAnyLexicon.isEmpty()) {
                return UNKNOWN;
            }

            // try every candidate POS in the wordnet before falling back to domain labels
            for (EPartOfSpeech pos : ubyPosTags) {
                List<LexicalEntry> wordnetEntries = uby.getLexicalEntries(lemma, pos, wordnet);
                if (!wordnetEntries.isEmpty()) {
                    // the lemma is listed in the English or German wordnet with the given POS
                    return getSemanticField(getWordnetSense(wordnetEntries, wordnet));
                }
            }

            // find a semantic label of type domain among the UBY entries of the lemma
            // and retrieve the semantic field of that domain label
            return getSemanticField(getOtherSemanticLabelValue(entriesInAnyLexicon), wordnet);
        }
        catch (Exception e) {
            throw new ResourceAccessException(e);
        }
    }

    /**
     * Returns the semantic field of a multiword given as a sequence of tokens. The lemmas of
     * the tokens are joined with single spaces and looked up as one lemma.
     *
     * @param tokens
     *            the tokens forming the multiword
     * @return the semantic field, or {@code "UNKNOWN"} if the list is {@code null} or empty,
     *         the language is neither English nor German, or no lexicon contains the multiword
     * @throws ResourceAccessException
     *             if any exception occurs while accessing the tokens or the UBY resource
     */
    @Override
    public String getSemanticTag(List<Token> tokens)
        throws ResourceAccessException
    {
        if (tokens == null || tokens.isEmpty()) {
            return UNKNOWN;
        }

        try {
            List<String> lemmas = new ArrayList<String>();
            for (Token token : tokens) {
                lemmas.add(token.getLemma().getValue());
            }
            String lemmaString = StringUtils.join(lemmas, " ");

            String lexiconName = resolveWordnetName(tokens.get(0).getCAS().getDocumentLanguage());
            if (lexiconName == null) {
                return UNKNOWN;
            }
            Lexicon wordnet = uby.getLexiconByName(lexiconName);

            // we do not check, if the lemmaString has an entry in the lexicon with a POS
            // corresponding to the Core POS type because multiwords tend to have
            // non-consistent POS assigned in the lexicon
            List<LexicalEntry> entriesInAnyLexicon = uby.getLexicalEntries(lemmaString, null, null);
            if (entriesInAnyLexicon.isEmpty()) {
                return UNKNOWN;
            }

            List<LexicalEntry> wordnetEntries = uby.getLexicalEntries(lemmaString, null, wordnet);
            if (!wordnetEntries.isEmpty()) {
                // the multiword is listed in the English or German wordnet
                return getSemanticField(getWordnetSense(wordnetEntries, wordnet));
            }

            // find a semantic label of type domain among the UBY entries of the multiword
            // and retrieve the semantic field of that domain label
            return getSemanticField(getOtherSemanticLabelValue(entriesInAnyLexicon), wordnet);
        }
        catch (Exception e) {
            throw new ResourceAccessException(e);
        }
    }

    /**
     * Determines which wordnet lexicon to use. The language is specified as ISO 2-letter code
     * (following the DKPro-Core convention). Precedence: English document, English parameter,
     * German document, German parameter.
     *
     * @param documentLanguage
     *            the document language of the CAS, may be {@code null}
     * @return {@code "WordNet"}, {@code "GermaNet"}, or {@code null} if neither the document
     *         language nor the configured language is English or German
     */
    private String resolveWordnetName(String documentLanguage)
    {
        // constant-first equals: the language parameter is optional and may be null
        if ("en".equals(documentLanguage) || "en".equals(language)) {
            return WORDNET;
        }
        if ("de".equals(documentLanguage) || "de".equals(language)) {
            return GERMANET;
        }
        return null;
    }

    /**
     * Selects the sense used to look up the semantic field.
     *
     * @param lexicalEntries
     *            entries found in the given wordnet
     * @param wordnet
     *            the lexicon the entries were retrieved from
     * @return the selected sense, or {@code null} if none is available
     */
    private Sense getWordnetSense(List<LexicalEntry> lexicalEntries, Lexicon wordnet)
    {
        String lexiconName = wordnet.getName();
        if (WORDNET.equals(lexiconName)) {
            // WordNet contains MFS information, since the senses are ordered by decreasing
            // frequency in SemCor: in UBY, this is the sense with index = 1
            return getMostFrequentSense(lexicalEntries);
        }
        if (GERMANET.equals(lexiconName)) {
            // GermaNet does not contain MFS information; the first sense is used
            if (lexicalEntries.isEmpty()) {
                return null;
            }
            List<Sense> senses = lexicalEntries.get(0).getSenses();
            return (senses == null || senses.isEmpty()) ? null : senses.get(0);
        }
        return null;
    }

    /**
     * Extracts the semantic field from the labels of a sense. Everything up to and including
     * the last dot of the label is stripped. If several labels of type
     * {@code semanticField} exist, the last one wins.
     *
     * @param sense
     *            the sense to inspect, may be {@code null}
     * @return the semantic field, or {@code "UNKNOWN"} if there is none
     */
    private String getSemanticField(Sense sense)
    {
        if (sense == null) {
            // NOTE(review): diagnostic output on stdout kept as-is; consider a logger instead
            System.out.println("sense was null");
            return UNKNOWN;
        }

        String semanticField = UNKNOWN;
        for (SemanticLabel sl : sense.getSemanticLabels()) {
            Object type = sl.getType();
            String label = sl.getLabel();
            if (type != null && label != null && "semanticField".equals(type.toString())) {
                semanticField = label.replaceAll(".*\\.", "");
            }
        }
        return semanticField;
    }

    /**
     * Finds the first semantic label of type domain among the senses of the given entries.
     *
     * @param lexicalEntries
     *            the entries to search, in order
     * @return the label value, or the empty string if no domain label exists
     */
    private String getOtherSemanticLabelValue(List<LexicalEntry> lexicalEntries)
    {
        for (LexicalEntry lexicalEntry : lexicalEntries) {
            for (Sense s : lexicalEntry.getSenses()) {
                for (SemanticLabel sl : s.getSemanticLabels()) {
                    // enum-first equals tolerates labels without a type
                    if (ELabelTypeSemantics.domain.equals(sl.getType()) && sl.getLabel() != null) {
                        // return right away so that the first match is not overwritten
                        return sl.getLabel();
                    }
                }
            }
        }
        return "";
    }

    /**
     * Gets the semantic field of a semantic label value of type "domain" by looking the value
     * up as a lemma in the given wordnet.
     *
     * @param semanticLabelValue
     *            the domain label value, may be empty
     * @param wordnet
     *            the wordnet lexicon to search
     * @return the semantic field, or {@code "UNKNOWN"} if the value is empty or not listed
     */
    private String getSemanticField(String semanticLabelValue, Lexicon wordnet)
    {
        if (semanticLabelValue == null || semanticLabelValue.isEmpty()) {
            // no domain label was found, so there is nothing to look up
            return UNKNOWN;
        }

        List<LexicalEntry> lexicalEntries = uby.getLexicalEntries(semanticLabelValue, null, wordnet);
        if (lexicalEntries.isEmpty()) {
            return UNKNOWN;
        }
        return getSemanticField(getWordnetSense(lexicalEntries, wordnet));
    }
}