/**
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.tudarmstadt.ukp.lmf.transform.wordnet;
import java.io.File;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Synset;
import net.sf.extjwnl.data.Word;
import net.sf.extjwnl.dictionary.Dictionary;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation;
import de.tudarmstadt.ukp.lmf.model.morphology.Lemma;
import de.tudarmstadt.ukp.lmf.model.semantics.PredicativeRepresentation;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame;
import de.tudarmstadt.ukp.lmf.model.syntax.SyntacticBehaviour;
import de.tudarmstadt.ukp.lmf.transform.wordnet.util.IndexSenseReader;
import de.tudarmstadt.ukp.lmf.transform.wordnet.util.WNConvUtil;
/**
* Instance of this class offers methods for creating {@link LexicalEntry} out of WordNet's data
* @author Zijad Maksuti
* @author Judith Eckle-Kohler
*
*/
public class LexicalEntryGenerator {
/*
* Mappings of lexemes with equal lemma and part of speech with associated LexicalEntries
*/
private Map<Set<Word>, LexicalEntry> lexemeGroupLexicalEntryMaping;
private final Map<POS,Map<String, Set<Word>>> posLemmaLexemeGroup = new LinkedHashMap<POS, Map<String, Set<Word>>>();
private final Set<Set<Word>> lexemeGroups = new LinkedHashSet<Set<Word>>();
private final Map<Word, Set<Word>> lexemeToGroupMappings;
private Dictionary extWordnet;
/*
* All generated LexicalEntries
*/
private final List<LexicalEntry> lexicalEntries = new LinkedList<LexicalEntry>();
private int lexicalEntryNumber; // used for creating IDs of LexicalEntries
private int syntacticBehaviourNumber; // used for creating syntacticBehaviour IDs
private boolean initialized = false; // true only when lexicalEntryGenerator is initialized
private SenseGenerator senseGenerator; // instance of SenseGenerator used
private final SubcategorizationFrameExtractor subcategorizationFrameExtractor; // used for creating SyntacticBehaviours
/*
* This map prevents creating identical SyntacticBehaviours with different IDs
* Key of the map is SyntacticBehaviour's string representation without ID
* value is the corresponding SyntacticBehavour
*/
private final Map<String, SyntacticBehaviour> syntBeh = new TreeMap<String, SyntacticBehaviour>();
private final Log logger = LogFactory.getLog(getClass());
/**
* Constructs a {@link LexicalEntryGenerator} used for generating LexicalEntries
* @param dictionaryPath the path of the WordNet dictionary files
* @param extWordnet an instance of initialized WordNet-{@link Dictionary} used for accessing WordNet's information
* @param synsetGenerator an instance of {@link SynsetGenerator} used for generating {@link Synset}-instances
* @param subcategorizationFrameExtractor an instance of {@link SubcategorizationFrameExtractor} used for generating {@link SubcategorizationFrame}-instances
* @param Version of the resource
* @see {@link LexicalEntry}
*/
public LexicalEntryGenerator(File dictionaryPath, Dictionary extWordnet, SynsetGenerator synsetGenerator,
SubcategorizationFrameExtractor subcategorizationFrameExtractor, String resourceVersion){
this.subcategorizationFrameExtractor = subcategorizationFrameExtractor;
lexemeToGroupMappings = new TreeMap<Word, Set<Word>>(new Comparator<Word>() {
@Override
public int compare(Word o1, Word o2) {
try {
return o1.getSenseKey().compareTo(o2.getSenseKey());
}
catch (JWNLException e) {
throw new IllegalArgumentException(e);
}
}
});
if(!initialized){
this.extWordnet = extWordnet;
lexicalEntryNumber = 0;
syntacticBehaviourNumber = 0;
groupLexemes();
IndexSenseReader isr = new IndexSenseReader();
isr.initialize(new File(dictionaryPath, "index.sense"));
senseGenerator = new SenseGenerator(synsetGenerator, isr, resourceVersion);
createLexicalEntries();
initialized = true;
}
}
/**
* This method groups all lexemes contained in WordNet 3.0 by lemma and part of speech
*/
private void groupLexemes() {
byte percentage = 0;
logger.info(" grouping lexemes...");
lexemeGroupLexicalEntryMaping= new LinkedHashMap<Set<Word>, LexicalEntry>();
Iterator<Synset> synsetIter = null; // synset iterator
for(POS pos : POS.getAllPOS()){ // Iterate over all POSes
logger.info(percentage+"%");
Map<String, Set<Word>>lemmaLexemeGroup = new TreeMap<String, Set<Word>>();
try {
synsetIter = extWordnet.getSynsetIterator(pos);
} catch (JWNLException e) {
e.printStackTrace();
}
while(synsetIter.hasNext()){ // Iterate over all Synsets (Lemmas)
Synset synset = synsetIter.next();
List<Word> lexemes = synset.getWords(); // lexemes of the Synset
for(Word lexeme : lexemes){
Set<Word> lexemeGroup; // group of lexemes with equal lemma
String lemma = lexeme.getLemma(); // lemma's lexeme
if((lexemeGroup = lemmaLexemeGroup.get(lemma)) == null){
lexemeGroup = new TreeSet<Word>(new Comparator<Word>() {
@Override
public int compare(Word o1, Word o2) {
try {
return o1.getSenseKey().compareTo(o2.getSenseKey());
}
catch (JWNLException e) {
throw new IllegalArgumentException(e);
}
}
});
lemmaLexemeGroup.put(lemma, lexemeGroup);
}
lexemeGroup.add(lexeme);
lexemeToGroupMappings.put(lexeme, lexemeGroup);
}
}
posLemmaLexemeGroup.put(pos, lemmaLexemeGroup);
lexemeGroups.addAll(lemmaLexemeGroup.values());
percentage +=25;
}
logger.info("100%");
}
/**
* This method iterates over all {@link LexicalEntryGenerator#lexemeGroups} and
* creates a list of LexicalEntries for every group
* @see LexicalEntry
*/
private void createLexicalEntries(){
logger.info("transforming lexeme groups... 0% ");
int size = lexemeGroups.size();
int tenPercent = size/10;
int percentageCounter=0;
int percentage = 0;
for(Set<Word> lexemeGroup : lexemeGroups){
LexicalEntry lexicalEntry = createLexicalEntry(lexemeGroup);
lexicalEntries.add(lexicalEntry);
lexemeGroupLexicalEntryMaping.put(lexemeGroup, lexicalEntry);
if(percentageCounter++ == tenPercent){
percentage +=10;
percentageCounter = 0;
logger.info(percentage+"%");
}
}
logger.info("100%");
}
/**
* This method consumes a lexemeGroup and generates
* the corresponding {@link LexicalEntry}-instance
* @param lexemeGroup a group of lexemes with equal lemma and part of speech
* @return a LexicalEntry that corresponds to lexemeGroup
*/
private LexicalEntry createLexicalEntry(Set<Word> lexemeGroup) {
// Create a new LexicalEntry for the consumed group
LexicalEntry lexicalEntry = new LexicalEntry();
// Create ID for this lexicalEntry
lexicalEntry.setId(createID());
// codes of subcat frames
List<Map<String, Word>> subcatCodes = new LinkedList<Map<String, Word>>();
boolean posSet = false; // True when POS is set to the LexicalEntry
String lemmaString = null; // Lemmas Written form
for(Word lexeme : lexemeGroup){
if(!posSet){
// Extract the POS of the first Lexeme in the group
lexicalEntry.setPartOfSpeech(WNConvUtil.getPOS(lexeme.getPOS()));
posSet = true;
// Extract lemma
lemmaString = lexeme.getLemma();
}
EPartOfSpeech lePOS = lexicalEntry.getPartOfSpeech();
if(lePOS.equals(EPartOfSpeech.verb)){
// Extracting the verb frame
String[] frames = lexeme.getSynset().getVerbFrames();
for(String frame : frames){
Map<String, Word> codeLexeme = new TreeMap<String, Word>();
codeLexeme.put(frame, lexeme);
subcatCodes.add(codeLexeme); // the codes will be processed later
}
}
// extracting the subcat frame of an adjective
String synMarker;
try {
if(lePOS.equals(EPartOfSpeech.adjective) && (synMarker = lexeme.getSenseKeyWithAdjClass()).contains("(")){
int start = synMarker.indexOf("(");
String adjFrameCode = synMarker.substring(start+1, synMarker.indexOf(")"));
Map<String, Word> codeLexeme = new TreeMap<String, Word>();
codeLexeme.put(adjFrameCode, lexeme); // the codes will be processed later
subcatCodes.add(codeLexeme);
}
}
catch (JWNLException e) {
throw new IllegalArgumentException(e);
}
}
//*** Creating Lemma ***//
Lemma lemma = new Lemma();
List<FormRepresentation> formRepresentations = new LinkedList<FormRepresentation>();
FormRepresentation formRepresentation = new FormRepresentation();
formRepresentation.setLanguageIdentifier(ELanguageIdentifier.ENGLISH);
formRepresentation.setWrittenForm(lemmaString);
formRepresentations.add(formRepresentation);
lemma.setFormRepresentations(formRepresentations);
lexicalEntry.setLemma(lemma);
//*** Creating Senses ***//
lexicalEntry.setSenses(senseGenerator.generateSenses(lexemeGroup, lexicalEntry));
//*** Creating SyntacticBehaviours***//
if(!subcatCodes.isEmpty()){
Set<SyntacticBehaviour> syntacticBehaviours = new TreeSet<SyntacticBehaviour>();
for(Map<String, Word> mapping : subcatCodes){
// create a SyntacticBehaviour for every subcat code
SyntacticBehaviour syntacticBehaviour = new SyntacticBehaviour();
for(String frame : mapping.keySet()){
Word lexeme = mapping.get(frame);
Sense sense = senseGenerator.getSense(lexeme);
syntacticBehaviour.setSense(sense);
SubcategorizationFrame subcategorizationFrame = subcategorizationFrameExtractor.getSubcategorizationFrame(frame);
syntacticBehaviour.setSubcategorizationFrame(subcategorizationFrame);
// Updating PredicativeRepresentations of the sense
SemanticPredicate semanticPredicate = subcategorizationFrameExtractor.getSemanticPredicate(frame);
if(semanticPredicate != null){
// PredicativeRepresentation will only be updated if sementicPredicate for this Sense exists
List<PredicativeRepresentation> predicativeRepresentations = new LinkedList<PredicativeRepresentation>();
PredicativeRepresentation predicativeRepresentation = new PredicativeRepresentation();
predicativeRepresentation.setPredicate(semanticPredicate);
predicativeRepresentations.add(predicativeRepresentation);
if(sense.getPredicativeRepresentations() != null && !sense.getPredicativeRepresentations().isEmpty()) {
sense.getPredicativeRepresentations().addAll(predicativeRepresentations);
}
else {
sense.setPredicativeRepresentations(predicativeRepresentations);
}
}
}
/*
* check in the mapping, if an equivalent SyntactiBehaviour was already created
*/
String synBehString = createString(syntacticBehaviour);
SyntacticBehaviour created = syntBeh.get(synBehString);
if(created != null) {
syntacticBehaviours.add(created);
}
else {
// set the id of the new SyntacticBehaviour
// and make a record
StringBuffer sb = new StringBuffer(64);
sb.append("WN_SyntacticBehaviour_").append(syntacticBehaviourNumber++);
syntacticBehaviour.setId(sb.toString());
syntBeh.put(synBehString, syntacticBehaviour);
syntacticBehaviours.add(syntacticBehaviour);
}
}
lexicalEntry.setSyntacticBehaviours(new ArrayList<SyntacticBehaviour>(syntacticBehaviours));
}
return lexicalEntry;
}
/**
* This method creates a string-representation of a {@link SyntacticBehaviour}
* without SyntacticBehaviour's ID
* @param syntacticBehaviour
* @return syntacticBehaviour's string-representation without the it's id
*/
private String createString(SyntacticBehaviour syntacticBehaviour) {
StringBuffer sb = new StringBuffer(64);
sb.append(syntacticBehaviour.getSense()).append(syntacticBehaviour.getSubcategorizationFrame());
sb.append(syntacticBehaviour.getSubcategorizationFrameSet());
return sb.toString();
}
/**
* This method creates an ID for a {@link LexicalEntry}. <br>
* The running number used for the creation of the id is incremented every time this method is called.
* @return ID of a lexicalEntry
*/
private String createID() {
StringBuffer sb = new StringBuffer(32);
sb.append("WN_LexicalEntry_").append(lexicalEntryNumber++);
return sb.toString();
}
/**
* Returns all LexicalEntries generated by this generator
* @return all LexicalEntries generated by this generator
* @see {@link LexicalEntry}
*/
public List<LexicalEntry> getLexicalEntries() {
return lexicalEntries;
}
/**
* Returns a LexicalEntry generated for the consumed lexemeGroup
* @param lexemeGroup a group of lexemes for which a LexicalEntry should be returned
* @return the LexicalEntry that corresponds to the consumed lexemeGroup
* @see LexicalEntry
*/
LexicalEntry getLexicalEntry(Set<Word> lexemeGroup){
return lexemeGroupLexicalEntryMaping.get(lexemeGroup);
}
/**
* Returns a {@link LexicalEntry} that corresponds to the consumed lexeme
* @param lexeme an instance of {@link Word} for which a LexicalEntry should be returned
* @return the LexicalEntry generated for the consumed lexeme
*/
public LexicalEntry getLexicalEntry(Word lexeme){
return this.getLexicalEntry(this.getGroup(lexeme));
}
/**
* Returns the mappings between lexeme groups and associated LexicalEntries of this generator
* @return this generator's {@link LexicalEntryGenerator#lexemeGroupLexicalEntryMaping}
* @see LexicalEntry
* @see Word
*/
Map<Set<Word>, LexicalEntry> getLexemeGroupLexicalEntryMaping() {
return lexemeGroupLexicalEntryMaping;
}
/**
* Returns the {@link SenseGenerator} used by this generator
* @return the senseGenerator used by this generator
* @see Sense
*/
SenseGenerator getSenseGenerator() {
return senseGenerator;
}
/**
* This method returns a group of lexemes which contains the consumed lexeme
* @param lexeme the lexeme which group should be returned
* @return the lexeme-group of the consumed lexeme
* @see Word
*/
Set<Word> getGroup(Word lexeme){
return lexemeToGroupMappings.get(lexeme);
}
}