/** * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.tudarmstadt.ukp.lmf.transform.wordnet; import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; import java.util.TreeMap; import net.sf.extjwnl.JWNLException; import net.sf.extjwnl.data.Word; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry; import de.tudarmstadt.ukp.lmf.model.core.Sense; import de.tudarmstadt.ukp.lmf.model.core.TextRepresentation; import de.tudarmstadt.ukp.lmf.model.enums.EExampleType; import de.tudarmstadt.ukp.lmf.model.enums.ELabelTypeSemantics; import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier; import de.tudarmstadt.ukp.lmf.model.meta.SemanticLabel; import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef; import de.tudarmstadt.ukp.lmf.model.semantics.SenseExample; import de.tudarmstadt.ukp.lmf.model.semantics.Synset; import de.tudarmstadt.ukp.lmf.transform.wordnet.util.IndexSenseReader; /** * This class offers methods for generating instances of {@link Sense} class * from the Princeton WordNet. */ public class SenseGenerator { public final static String EXTERNAL_SYSTEM_SENSE_KEY = "senseKey"; private final IndexSenseReader isr; /* * Synset generator used for obtaining mappings between * WordNet's synsets and synsets defined in Uby-LMF */ private final SynsetGenerator synsetGenerator; // used for creating IDs of SenseExamples private int senseExampleNumber=0; // Running number for creating Sense-IDs private int lmfSenseNumber; // Mappings between lexemes and associated Senses private final Map<Word, Sense> lexemeSenseMappings; private final String resourceVersion; private final Log logger = LogFactory.getLog(getClass()); /** * Constructs a {@link SenseGenerator} based on the consumed parameters * @param synsetGenerator a SynsetGenerator used for obtaining Synsets * @param isr reader used for parsing WordNet's index.sense file * @param resourceVersion Verstion of the resource * @see Sense * @see Synset * @see IndexSenseReader */ public SenseGenerator(SynsetGenerator synsetGenerator, IndexSenseReader isr, String resourceVersion){ this.synsetGenerator = synsetGenerator; this.isr = isr; this.resourceVersion = resourceVersion; lexemeSenseMappings = new TreeMap<Word, Sense>(new Comparator<Word>() { @Override public int compare(Word o1, Word o2) { try { return o1.getSenseKey().compareTo(o2.getSenseKey()); } catch (JWNLException e) { throw new IllegalArgumentException(e); } } }); } /** * This method consumes a {@link Set} of lexemes and generates a list of Senses. <br> * Every {@link Sense} in the returned list is associated with one lexeme in the consumed Set. * * @param lexemeGroup a group of lexemes with equal lemma and part of speech * * @param lexicalEntry a {@link LexicalEntry} instance that contains generated Senses. * * @return list of Sense-instances, based on the consumed group of lexemes * * @since UBY 0.2.0 * * @see Word * */ public List<Sense> generateSenses(Set<Word> lexemeGroup, LexicalEntry lexicalEntry){ List<Sense> result = new ArrayList<Sense>(); // a list of Senses that need a dummy sense number List<Sense> needDummySenseNumber = new ArrayList<Sense>(); int nextIndex = 1; // dummy index // every lexeme has a sense of it's own for(Word lexeme : lexemeGroup){ Sense sense = new Sense(); lexemeSenseMappings.put(lexeme, sense); //set ID sense.setId(getNewID()); sense.setLexicalEntry(lexicalEntry); // setting index of the Sense (lexeme's Position in the WN-Synset) String senseNumber; try { senseNumber = isr.getSenseNumber(lexeme.getSenseKey()); } catch (JWNLException e) { throw new IllegalArgumentException(e); } if(senseNumber != null){ int index = Integer.parseInt(senseNumber); if(nextIndex <= index) { nextIndex = index+1; } sense.setIndex(index); } else{ // sense needs a dummy value for index needDummySenseNumber.add(sense); StringBuffer sb = new StringBuffer(128); sb.append("IndexSenseReader did not provide sense number for senseKey "); try { sb.append(lexeme.getSenseKey()).append('\n'); } catch (JWNLException e) { throw new IllegalArgumentException(e); } sb.append("adding a dummy value of sense number"); logger.warn(sb.toString()); } net.sf.extjwnl.data.Synset lexemeSynset = lexeme.getSynset(); // lexemes Synset //set Synset Synset lmfSynset = synsetGenerator.getLMFSynset(lexemeSynset); if(lmfSynset == null){ StringBuffer sb = new StringBuffer(512); sb.append("Synset generator did not provide Uby-LMF Synset for WordNet's Synset "); sb.append(lexemeSynset).append('\n'); sb.append("Closing VM"); logger.error(sb.toString()); System.exit(1); } sense.setSynset(lmfSynset); // set semanticLabel List<SemanticLabel> semanticLabels = new LinkedList<SemanticLabel>(); SemanticLabel semanticLabel = new SemanticLabel(); semanticLabels.add(semanticLabel); semanticLabel.setLabel(lexemeSynset.getLexFileName()); semanticLabel.setType(ELabelTypeSemantics.semanticField); sense.setSemanticLabels(semanticLabels); // Creating MonolingualExternalRef for a Sense MonolingualExternalRef monolingualExternalRef = new MonolingualExternalRef(); /**/ StringBuffer sb = new StringBuffer(32); sb.append(lexeme.getSynset().getPOS()); sb.append(" "); try { sb.append(lexeme.getSenseKey()); } catch (JWNLException e) { throw new IllegalArgumentException(e); } monolingualExternalRef.setExternalSystem("WordNet 3.0 part of speech and sense key"); monolingualExternalRef.setExternalReference(sb.toString()); /**/ //TODO: Check implications! // monolingualExternalRef.setExternalSystem(resourceVersion + "_" + EXTERNAL_SYSTEM_SENSE_KEY); // monolingualExternalRef.setExternalReference(lexeme.getSenseKey()); List<MonolingualExternalRef> monolingualExternalRefs = new LinkedList<MonolingualExternalRef>(); monolingualExternalRefs.add(monolingualExternalRef); sense.setMonolingualExternalRefs(monolingualExternalRefs); //*** create sense examples of the sense *** // List<SenseExample> senseExamples = new ArrayList<SenseExample>(); List<String> exampleStrings = synsetGenerator.getExamples(lexeme); if(exampleStrings != null) { for(String exampleSentence : exampleStrings){ SenseExample senseExample = new SenseExample(); // Create an id for the senseExample StringBuffer id = new StringBuffer(32); id.append("WN_SenseExample_").append(senseExampleNumber++); senseExample.setId(id.toString()); senseExample.setExampleType(EExampleType.other); TextRepresentation textRepresentation = new TextRepresentation(); textRepresentation.setLanguageIdentifier(ELanguageIdentifier.ENGLISH); textRepresentation.setWrittenText(exampleSentence); senseExample.setTextRepresentations(new ArrayList<TextRepresentation>(Arrays.asList(textRepresentation))); senseExamples.add(senseExample); } } // setting senseExamples sense.setSenseExamples(senseExamples); // Add the created Sense to the result result.add(sense); } /* * Adding dummy indexes to senses if needed */ for(Sense sense : needDummySenseNumber) { sense.setIndex(nextIndex++); } return result; } /** * This method generates a Sense-ID. <br> * Every time the method is called, it increments the running number used for the creation of the ID. * @return an ID of a Sense-instance * @see Sense */ private String getNewID() { StringBuffer sb = new StringBuffer(64); sb.append("WN_Sense_").append(Integer.toString(lmfSenseNumber)); lmfSenseNumber++; return sb.toString(); } /** * Returns the Sense-instance associated with the consumed lexeme * @param lexeme a lexeme for which the generated Sense-intance should be returned * @return Sense-instance associated with the consumed lexeme,<br> * or null if this generator has not generated a Sense for the consumed lexeme * @see Sense * @see Word */ public Sense getSense(Word lexeme){ return lexemeSenseMappings.get(lexeme); } /** * Returns all lexemes processed by this {@link SenseGenerator} * @return all lexemes processed by this SenseGenerator * @see Word */ public Set<Word> getProcessedLexemes(){ return lexemeSenseMappings.keySet(); } }