/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.uby.uima.writer;
import static de.tudarmstadt.ukp.uby.resource.UbyResourceUtils.getMostFrequentSense;
import java.io.BufferedWriter;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ExternalResource;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence;
import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
import de.tudarmstadt.ukp.dkpro.core.api.semantics.type.SemanticField;
import de.tudarmstadt.ukp.lmf.api.Uby;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.enums.ELabelTypeSemantics;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.enums.ESyntacticCategory;
import de.tudarmstadt.ukp.lmf.model.enums.EVerbForm;
import de.tudarmstadt.ukp.lmf.model.meta.SemanticLabel;
import de.tudarmstadt.ukp.lmf.model.semantics.PredicativeRepresentation;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tudarmstadt.ukp.lmf.model.semantics.SynsetRelation;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame;
import de.tudarmstadt.ukp.lmf.model.syntax.SyntacticArgument;
import de.tudarmstadt.ukp.lmf.model.syntax.SyntacticBehaviour;
/**
* @author Eckle-Kohler
*
*/
public class SemanticTagWriter
extends org.apache.uima.fit.component.JCasAnnotator_ImplBase
{
/**
* Name of the output file
*/
public static final String PARAM_TARGET_LOCATION = "outputParam";
@ConfigurationParameter(name = PARAM_TARGET_LOCATION, mandatory = true)
private String outputParam;
public static final String RES_UBY = "uby";
@ExternalResource(key = RES_UBY)
private Uby uby;
private BufferedWriter writer;
private static ArrayList<String> auxiliariesAndModals = new ArrayList<String>(
Arrays.asList("be", "do", "have", "can", "will", "should", "must"));
@Override
public void initialize(UimaContext context)
throws ResourceInitializationException
{
super.initialize(context);
try {
writer = new BufferedWriter(new FileWriter(outputParam));
}
catch (IOException ex) {
throw new ResourceInitializationException(ex);
}
}
@Override
public void process(JCas jcas)
throws AnalysisEngineProcessException
{
for (Sentence sentence : JCasUtil.select(jcas, Sentence.class)) {
List<Token> sentenceTokens = JCasUtil.selectCovered(jcas, Token.class, sentence);
for (int i = 0; i < sentenceTokens.size(); i++) {
Token token = sentenceTokens.get(i);
Sense mfs = null;
// Uby provides lexical information mainly for content words: nouns, main verbs, adjectives;
// auxiliary and modal verbs are contained in Uby, but in running text they are rarely used as main verbs,
// but mostly as function words (to form particular tense and voice constructions) or as modality markers
if ((token.getPos().getType().getShortName().equals("V") ||
token.getPos().getType().getShortName().matches("N.*") ||
token.getPos().getType().getShortName().equals("ADJ")) &&
!auxiliariesAndModals.contains(token.getLemma().getValue())) {
mfs = getMostFrequentSense(uby.getLexicalEntries(token.getLemma().getValue(), null, null));
}
// write lemma, POS annotations and results of Uby lookup to the output file:
String syntacticBehaviour = getSyntacticBehaviour(token.getPos().getType().getShortName(),uby.getLexicalEntries(token.getLemma().getValue(),
EPartOfSpeech.verb, null));
List<SemanticField> semanticFieldAnnotations = JCasUtil.selectCovering(jcas,
SemanticField.class, token.getBegin(), token.getEnd());
for (int j = 0; j < semanticFieldAnnotations.size(); j++) {
SemanticField semanticField = semanticFieldAnnotations.get(j);
String semFieldValue = "---";
if (semanticField.getValue().equals("UNKNOWN")) {
semFieldValue = "---";
} else {
semFieldValue = semanticField.getValue();
}
if (mfs != null && mfs.getSynset() != null
&& !auxiliariesAndModals.contains(token.getLemma().getValue())) {
writeTokenAndSemanticField(token.getCoveredText() + "\t"
+ token.getLemma().getValue() + "\t"
+ token.getPos().getType().getShortName() + "\n"
+ "\t syntax: " +syntacticBehaviour + "\n"
// for retrieving semantic field, synonyms and semantically related words, the word is disambiguated
// according to the MFS heuristic
+ "\t semantic field: " +semFieldValue + "\n"
+ "\t synonyms: " +getSynonymousWords(token.getLemma().getValue(), mfs.getSynset()) + "\n"
+ "\t related: " +getSemanticallyRelatedWords(mfs.getSynset()) + "\n"
// "associated topics" means something like creatively associating topics with a given word
// for constructing creative associations, disambiguation is not necessary (it actually limits association links)
+ "\t associated: " +getSemanticLabels(uby.getLexicalEntries(token.getLemma().getValue(), null, null)) + "\n"
);
} else {
writeTokenAndSemanticField(token.getCoveredText() + "\t"
+ token.getLemma().getValue() + "\t"
+ token.getPos().getType().getShortName() + "\n"
);
}
}
}
}
}
/*
* This method groups the complex subcat frames into four classes:
* transitive, intransitive, transitive with to-infinitive, intransitive
* with to-infinitive this four-way classification could be useful in many
* (linguistic or text classification) contexts, because all four classes
* have a distinct lexical semantics
*/
private String getSyntacticBehaviour(String pos,
List<LexicalEntry> lexicalEntries)
{
String result = "---";
int numberOfTransitiveFrames = 0;
int numberOfIntransitiveFrames = 0;
boolean withToInfinitive = false;
if (pos.equals("V")) {
for (LexicalEntry lexicalEntry : lexicalEntries) {
for (SyntacticBehaviour sb : lexicalEntry
.getSyntacticBehaviours()) {
try {
SubcategorizationFrame scf = sb
.getSubcategorizationFrame();
List<SyntacticArgument> synArgs = scf
.getSyntacticArguments();
for (SyntacticArgument synArg : synArgs) {
if (synArg.getSyntacticCategory().equals(
ESyntacticCategory.verbPhrase)
&& synArg.getVerbForm().equals(
EVerbForm.toInfinitive)) {
withToInfinitive = true;
}
}
if (synArgs.size() == 1) {
numberOfIntransitiveFrames++;
}
if (synArgs.size() >= 2) {
if (synArgs.get(0).getSyntacticCategory()
.equals(ESyntacticCategory.nounPhrase)
&& synArgs
.get(1)
.getSyntacticCategory()
.equals(ESyntacticCategory.nounPhrase)) {
numberOfTransitiveFrames++;
}
else if (synArgs.get(0).getSyntacticCategory()
.equals(ESyntacticCategory.nounPhrase)
&& synArgs
.get(1)
.getSyntacticCategory()
.equals(ESyntacticCategory.prepositionalPhrase)) {
numberOfIntransitiveFrames++;
}
}
}
catch (NullPointerException e) {
// sth wrong with subcat frame
}
}
}
}
if (numberOfTransitiveFrames == 0 && numberOfIntransitiveFrames == 0) {
result = "---";
}
else if (numberOfTransitiveFrames > 0) {
if (withToInfinitive) {
result = "transitive/with_to-infinitive";
}
else {
result = "transitive";
}
}
else if (numberOfTransitiveFrames == 0
&& numberOfIntransitiveFrames > 0) {
if (withToInfinitive) {
result = "intransitive/with_to-infinitive";
}
else {
result = "intransitive";
}
}
return result;
}
private void writeTokenAndSemanticField(String string)
{
try {
writer.write(string);
writer.flush();
}
catch (IOException e) {
e.printStackTrace();
}
}
private String getSemanticallyRelatedWords(Synset synset)
{
String result = null;
HashSet<String> semanticallyRelatedWords = new HashSet<String>();
for (SynsetRelation synsetRel : synset.getSynsetRelations()) {
try {
for (Sense s : synsetRel.getTarget().getSenses()) {
if (s.getIndex() == 1) {
semanticallyRelatedWords.add(s.getLexicalEntry()
.getLemmaForm());
}
}
}
catch (NullPointerException e) {
// sth wrong with target of synset relation
}
}
if (semanticallyRelatedWords.isEmpty()) {
result = "---";
}
else {
result = semanticallyRelatedWords.toString().replaceAll("\\[", "")
.replaceAll("\\]", "");
}
return result;
}
private String getSynonymousWords(String lemma, Synset synset)
{
String result = null;
HashSet<String> synonymousWords = new HashSet<String>();
for (Sense sense : synset.getSenses()) {
try {
if (!lemma.equals(sense.getLexicalEntry().getLemmaForm())) {
synonymousWords.add(sense.getLexicalEntry().getLemmaForm());
}
}
catch (NullPointerException e) {
// sth wrong with target of synset relation
}
}
if (synonymousWords.isEmpty()) {
result = "---";
}
else {
result = synonymousWords.toString().replaceAll("\\[", "")
.replaceAll("\\]", "");
}
return result;
}
private String getSemanticLabels(List<LexicalEntry> lexicalEntries)
{
String result = null;
HashSet<String> semanticLabelValues = new HashSet<String>();
for (LexicalEntry lexicalEntry : lexicalEntries) {
for (Sense s : lexicalEntry.getSenses()) {
try {
for (SemanticLabel sl : s.getSemanticLabels()) {
if (!sl.getType().equals(
ELabelTypeSemantics.verbnetClass)
&& !sl.getType().equals(
ELabelTypeSemantics.semanticField)) {
semanticLabelValues.add(sl.getLabel());
}
}
for (PredicativeRepresentation pr : s
.getPredicativeRepresentations()) {
semanticLabelValues.add(pr.getPredicate().getLabel()
.toLowerCase());
}
}
catch (NullPointerException e) {
// no SemanticLabel type or label of SemanticPredicate is
// missing
}
}
}
if (semanticLabelValues.isEmpty()) {
result = "---";
}
else {
result = semanticLabelValues.toString().replaceAll("\\[", "")
.replaceAll("\\]", "");
}
return result;
}
}