/** * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.tudarmstadt.ukp.lmf.transform.germanet; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import de.tudarmstadt.ukp.lmf.model.core.LexicalResource; import de.tudarmstadt.ukp.lmf.model.core.Lexicon; import de.tudarmstadt.ukp.lmf.model.core.Sense; import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech; import de.tudarmstadt.ukp.lmf.model.enums.ESenseAxisType; import de.tudarmstadt.ukp.lmf.model.meta.MetaData; import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation; import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis; import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef; import de.tudarmstadt.ukp.lmf.model.semantics.Synset; import de.tuebingen.uni.sfs.germanet.api.EwnRel; import de.tuebingen.uni.sfs.germanet.api.GermaNet; import de.tuebingen.uni.sfs.germanet.api.IliRecord; import de.tuebingen.uni.sfs.germanet.api.LexUnit; /** * This class contains methods for converting the German part of the Interlingual Index (ILI), * provided in <a href="URL#http://www.sfs.uni-tuebingen.de/lsd/index.shtml">GermaNet</a>, * to {@link SenseAxis} instances. * * @since UBY 0.2.0 * * */ public class InterlingualIndexConverter { private final GNConverter gnConverter; private final GermaNet gnet; private final Lexicon wordNetLexicon; private final List<SenseAxis> senseAxes = new ArrayList<SenseAxis>(); private final MetaData metaData; // UBY-LMF synsets sorted by external reference private final Map<EPartOfSpeech, Map<Long, Synset>> synsetMappings = new HashMap<EPartOfSpeech, Map<Long,Synset>>(); // mappings between part of speech keys and part of speech specified by Uby-LMF private static final Map<String, EPartOfSpeech> _posKeyMappings = new HashMap<String, EPartOfSpeech>(); static{ // Put the POS mappings pos Key <-> EPartOfSpeech _posKeyMappings.put("n", EPartOfSpeech.noun); _posKeyMappings.put("v", EPartOfSpeech.verb); _posKeyMappings.put("a", EPartOfSpeech.adjective); _posKeyMappings.put("r", EPartOfSpeech.adverb); } // mappings between part of speech labels and part of speech specified by Uby-LMF private static final Map<String, EPartOfSpeech> _posLabelMappings = new HashMap<String, EPartOfSpeech>(); static{ // Put the POS mappings pos label <-> EPartOfSpeech _posLabelMappings.put("noun", EPartOfSpeech.noun); _posLabelMappings.put("verb", EPartOfSpeech.verb); _posLabelMappings.put("adjective", EPartOfSpeech.adjective); _posLabelMappings.put("adverb", EPartOfSpeech.adverb); } private final Log logger = LogFactory.getLog(getClass()); /** * Constructs an instance of {@link InterlingualIndexConverter} based on the consumed * parameters. * * @since 0.2.0 * * @param gnConverter an instance of {@link GNConverter} associated with this generator * * @param gnet {@link GermaNet} instance used for accessing GermaNet data. * @param alignmentMetaData * * @param wordNetLexicon {@link LexicalResource} instance containing * <a href="URL#https://wordnet.princeton.edu/wordnet/">WordNet 3.0</a>. */ public InterlingualIndexConverter(GNConverter gnConverter, GermaNet gnet, Lexicon wordNetLexicon, MetaData alignmentMetaData) { this.gnConverter = gnConverter; this.gnet = gnet; this.wordNetLexicon = wordNetLexicon; this.metaData = alignmentMetaData; } /** * Starts the conversion process of GermaNets Interlingual Index to {@link SenseAxis} instances. * The generated sense axes can be obtained by invoking {@link #getSenseAxes()}. */ public void convert(){ createSynsetMappings(wordNetLexicon); SynsetGenerator synsetGenerator = gnConverter.getSynsetGenerator(); synsetGenerator.initialize(); List<IliRecord> iliRecords = gnet.getIliRecords(); int synsetAlignmentCounter = 0; int senseAlignmentCounter = 0; for(IliRecord iliRecord : iliRecords){ EwnRel relation = iliRecord.getEwnRelation(); if(relation.toString().equals("synonym")){ /* * Only synonyms are converted to sense axes */ String pwn30Id = iliRecord.getPwn30Id(); String offsetString = pwn30Id.replaceAll("ENG30-", ""); String[] temp = offsetString.split("-"); offsetString = temp[0]; EPartOfSpeech pos = getUbyPosFromKey(temp[1]); if(offsetString.contains("null")) { logger.warn("offsetString contains null-string for " + iliRecord); continue; // skip } long offset = Long.parseLong(offsetString); LexUnit lexUnit = gnet.getLexUnitByID(iliRecord.getLexUnitId()); Synset gnUBYSynset = synsetGenerator.getLMFSynset(lexUnit); /* * Obtain the UBY-LMF synset that corresponds to the WordNet 3.0 synset * targeted by the ILI-record */ Synset wnUBYSynset = synsetMappings.get(pos).get(offset); if(wnUBYSynset == null){ logger.warn("Synset for the given WordNet word could not be found. SenseAxis will not be generated." + iliRecord); continue; // skip } else{ /* * Create SenseAxis for Synset */ SenseAxis senseAxisSynset = new SenseAxis(); senseAxisSynset.setSynsetOne(gnUBYSynset); senseAxisSynset.setSynsetTwo(wnUBYSynset); senseAxisSynset.setSenseAxisType(ESenseAxisType.crosslingualSenseAlignment); senseAxisSynset.setId("GN_WN_Synset_Alignment_Interlingual_Index_"+synsetAlignmentCounter++); senseAxisSynset.setLexiconOne(gnConverter.getLexicalResource().getLexicons().get(0)); //available after calling toLMF? senseAxisSynset.setLexiconTwo(wordNetLexicon); senseAxisSynset.setMetaData(metaData); senseAxes.add(senseAxisSynset); } /* * Create SenseAxis for Sense */ String pwnWord = iliRecord.getPwnWord(); Sense gnUBYSense = gnConverter.getSynsetGenerator().getSense(lexUnit); Sense wnUBYSense = getSense(wnUBYSynset, pwnWord); if(wnUBYSense == null){ logger.warn("Sense for the given WordNet word ##" +pwnWord +"## could not be found. SenseAxis will not be generated." + iliRecord); continue; // skip } else{ SenseAxis senseAxisSense = new SenseAxis(); senseAxisSense.setSenseOne(gnUBYSense); senseAxisSense.setSenseTwo(wnUBYSense); senseAxisSense.setSenseAxisType(ESenseAxisType.crosslingualSenseAlignment); senseAxisSense.setId("GN_WN_Sense_Alignment_Interlingual_Index_"+senseAlignmentCounter++); senseAxisSense.setLexiconOne(gnConverter.getLexicalResource().getLexicons().get(0)); //available after calling toLMF? senseAxisSense.setLexiconTwo(wordNetLexicon); senseAxisSense.setMetaData(metaData); senseAxes.add(senseAxisSense); } } } } /** * Consumes a {@link Synset} instance that corresponds to a WordNet 3.0 synset and * a {@link String} representation of a word. It returns the * first {@link Sense} instance of the consumed UBY-LMF Synset, that belongs to a * LexicalEntry which has a lemma equal to the consumed word. * * @param wnUBYSynset synset that contains the senses to be queried * * @param pwnWord the returned must belong to a lexical entry with lemma equal to pwnWord * * @return sense that corresponds to the consumed word, or null if no sense in the * consumed synset belongs to a lexical entry with lemma that is equal to pwnWord */ private Sense getSense(Synset wnUBYSynset, String pwnWord) { List<Sense> senses = wnUBYSynset.getSenses(); for(Sense sense : senses) { for(FormRepresentation formRepresentation : sense.getLexicalEntry().getLemma().getFormRepresentations()) { if(formRepresentation.getWrittenForm().equals(pwnWord)) { return sense; } } } return null; } /** * Initializes {@link #synsetMappings} field. The field makes an efficient search for a * {@link Synset} possible, for a given {@link EPartOfSpeech} and WordNet 3.0 synset offset. * * @param wordNetLexicon {@link Lexicon} used for extracting the mappings */ private void createSynsetMappings(Lexicon wordNetLexicon) { List<Synset> synsets = wordNetLexicon.getSynsets(); synsetMappings.put(EPartOfSpeech.noun, new HashMap<Long, Synset>()); synsetMappings.put(EPartOfSpeech.verb, new HashMap<Long, Synset>()); synsetMappings.put(EPartOfSpeech.adjective, new HashMap<Long, Synset>()); synsetMappings.put(EPartOfSpeech.adverb, new HashMap<Long, Synset>()); for(Synset synset : synsets){ MonolingualExternalRef monolingualExternalRef = synset.getMonolingualExternalRefs().get(0); String posOffset = monolingualExternalRef.getExternalReference(); String[] temp = posOffset.split("]"); EPartOfSpeech pos = getUbyPosFromLabel(temp[0].split(" ")[1]); String stringOffset = temp[1].trim(); if(stringOffset.contains("null")) { logger.warn("stringOffset contains null-string"); continue; // skip } long offset = Long.parseLong(stringOffset); Map<Long, Synset> mapping = synsetMappings.get(pos); mapping.put(offset, synset); } } /** * This method consumes a {@link String} * and returns corresponding {@link EPartOfSpeech} * @param POS string as key, e.g. n, v, a * @return associated part of speech defined in UBY-LMF * @since 0.7.0 */ private static EPartOfSpeech getUbyPosFromKey(String pos) { EPartOfSpeech result = _posKeyMappings.get(pos); return result; } /** * This method consumes a {@link String} * and returns corresponding {@link EPartOfSpeech} * @param POS string as label, e.g. noun, verb * @return associated part of speech defined in UBY-LMF * @since 0.7.0 */ private static EPartOfSpeech getUbyPosFromLabel(String pos) { EPartOfSpeech result = _posLabelMappings.get(pos); return result; } /** * Returns the {@link List} of all {@link SenseAxis} instances, generated by this {@link InterlingualIndexConverter}. * * @return a list of sense axes generated by this converter, or an empty list if the converter * has not generated any sense axes */ public List<SenseAxis> getSenseAxes(){ return this.senseAxes; } //TODO fill in data public static MetaData getDefaultMetaData() { /* * Generate Metadata */ MetaData m = new MetaData(); m.setAutomatic(false); DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); Date creationDate; try { creationDate = formatter.parse("2014-04-01"); m.setCreationDate(creationDate); } catch (ParseException e) { // TODO Auto-generated catch block e.printStackTrace(); } m.setCreationProcess("semi-automatic"); m.setCreationTool("http://www.sfs.uni-tuebingen.de/GermaNet/ili.shtml"); m.setVersion("GN 9.0"); m.setId("GNWN_ILI_0");//TODO return m; } }