/**
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package de.tudarmstadt.ukp.lmf.transform.germanet;
import java.io.InputStream;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import de.tudarmstadt.ukp.lmf.model.core.GlobalInformation;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.LexicalResource;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.meta.MetaData;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tuebingen.uni.sfs.germanet.api.GermaNet;
import de.tuebingen.uni.sfs.germanet.api.LexUnit;
import de.tuebingen.uni.sfs.germanet.api.WordCategory;
/**
 * Converts
 * <a href="http://www.sfs.uni-tuebingen.de/lsd/index.shtml">GermaNet 7.0</a>
 * to LMF format.
 * <p>
 * Typical use: construct the converter, call {@link #toLMF()} (or
 * {@link #toLMF(Lexicon)}), then read the result from
 * {@link #getLexicalResource()}.
 *
 * @author Zijad Maksuti
 * @author Judith Eckle-Kohler
 */
public class GNConverter {

    /** Classpath location of the subcategorization frame mapping file. */
    private static final String SUBCAT_MAPPING_RESOURCE = "GermaNetSubcatMappings/gnFrameMapping.txt";

    private final GermaNet gnet; // GermaNet object that is converted
    private final LexicalResource lexicalResource; // target of the conversion
    private InputStream subcatStream; // subcat mapping file

    /*
     * Groups of LexUnits with equal lemma and part of speech
     */
    private Set<Set<LexUnit>> luGroups;

    /*
     * Index from a LexUnit to the group in luGroups that contains it.
     * Starts out empty so that getLUGroup() can be called safely before
     * the groups have been built.
     */
    private Map<LexUnit, Set<LexUnit>> luGroupIndex = new HashMap<LexUnit, Set<LexUnit>>();

    private SubcategorizationFrameExtractor subcategorizationFrameExtractor;
    private final SynsetGenerator synsetGenerator;
    private final String dtd_version;
    private final Log logger = LogFactory.getLog(getClass());
    private final String resourceVersion;
    private MetaData alignmentMetaData;

    /**
     * Constructs a {@link GNConverter} based on the consumed parameters.
     * <p>
     * The alignment meta data is attached to the lexical resource right away.
     * If the subcategorization mapping file cannot be loaded, the error is
     * logged and the JVM is terminated with exit status 1.
     *
     * @param germaNet initialized {@link GermaNet} object
     * @param lexicalResource initialized object of {@link LexicalResource}, which will be filled with GermaNet's data
     * @param alignmentMeta MetaData of ili Alignment; when {@code null},
     *        {@link InterlingualIndexConverter#getDefaultMetaData()} is used
     * @param resourceVersion Version of this resource
     * @param dtd_version specifies the version of the .dtd which will be written to lexicalResource
     */
    public GNConverter(GermaNet germaNet, LexicalResource lexicalResource, MetaData alignmentMeta, String resourceVersion, String dtd_version)
    {
        this.gnet = germaNet;
        this.lexicalResource = lexicalResource;
        this.alignmentMetaData = alignmentMeta;
        if (alignmentMetaData == null) {
            alignmentMetaData = InterlingualIndexConverter.getDefaultMetaData();
        }
        alignmentMetaData.setLexicalResource(this.lexicalResource);

        List<MetaData> metaList = this.lexicalResource.getMetaData();
        if (metaList == null) {
            // Guard against a resource that has no meta data list yet
            metaList = new LinkedList<MetaData>();
        }
        metaList.add(this.alignmentMetaData);
        this.lexicalResource.setMetaData(metaList);

        this.dtd_version = dtd_version;
        this.resourceVersion = resourceVersion;

        try {
            this.subcatStream = getClass().getClassLoader().getResourceAsStream(SUBCAT_MAPPING_RESOURCE);
            if (subcatStream == null) {
                // Report a missing resource explicitly instead of as an anonymous NullPointerException
                throw new IllegalStateException("Resource not found on classpath: " + SUBCAT_MAPPING_RESOURCE);
            }
            subcategorizationFrameExtractor = new SubcategorizationFrameExtractor(subcatStream);
        }
        catch (Exception e) {
            // Pass the exception along so that the cause ends up in the log
            logger.error("GNConverter: unable to load subcat mapping file. Aborting all operations", e);
            System.exit(1);
        }
        this.synsetGenerator = new SynsetGenerator(this.gnet, resourceVersion);
    }

    /**
     * Converts the informations provided by the initialized {@link GermaNet} object to LMF-format. <br>
     * The result of the conversion can be obtained by calling {@link GNConverter#getLexicalResource()}
     */
    public void toLMF() {
        // Setting attributes of LexicalResource
        lexicalResource.setName("GermaNet");
        lexicalResource.setDtdVersion(dtd_version);

        // *** Setting GlobalInformation *** //
        GlobalInformation globalInformation = new GlobalInformation();
        globalInformation.setLabel("LMF representation of GermaNet 7.0");
        lexicalResource.setGlobalInformation(globalInformation);

        //*** Setting Lexicon (only one since GermaNet is monolingual)***//
        Lexicon lexicon = createLexicon();
        LinkedList<Lexicon> lexicons = new LinkedList<Lexicon>();
        lexicons.add(lexicon);
        lexicalResource.setLexicons(lexicons);

        // *** Creating LexicalEntries *** //
        logger.info("Generating LexicalEntries...");
        List<LexicalEntry> lexicalEntries = createLexicalEntries();
        lexicon.setLexicalEntries(lexicalEntries);
        logger.info("Generated LexicalEntries: " + lexicalEntries.size());
        logVerbStatistics(lexicalEntries);

        // *** Appending SubcategorizationFrames *** //
        lexicon.setSubcategorizationFrames(subcategorizationFrameExtractor.getSubcategorizationFrames());

        // *** Appending SemanticPredicates *** //
        lexicon.setSemanticPredicates(subcategorizationFrameExtractor.getSemanticPredicates());

        // *** Appending Synsets *** //
        synsetGenerator.initialize();
        List<Synset> synsets = synsetGenerator.getSynsets();
        lexicon.setSynsets(synsets);
        logger.info("Generated synsets: " + synsets.size());

        // *** Appending SynSemCorrespondences *** //
        lexicon.setSynSemCorrespondences(subcategorizationFrameExtractor.getSynSemCorrespondences());
    }

    /**
     * Runs {@link #toLMF()} and afterwards creates an
     * {@link InterlingualIndexConverter} from this converter, the GermaNet
     * object, the consumed lexicon and the alignment meta data. The sense axes
     * produced by that converter are stored in the lexical resource.
     *
     * @param wordNetLexicon lexicon handed to the {@link InterlingualIndexConverter};
     *        presumably the WordNet lexicon GermaNet is aligned to (TODO confirm)
     */
    public void toLMF(Lexicon wordNetLexicon){
        toLMF();
        InterlingualIndexConverter iliConverter = new InterlingualIndexConverter(this, gnet, wordNetLexicon, alignmentMetaData);
        iliConverter.convert();
        lexicalResource.setSenseAxes(iliConverter.getSenseAxes());
    }

    /**
     * Creates the single German lexicon that receives all converted data.
     */
    private Lexicon createLexicon() {
        Lexicon lexicon = new Lexicon();
        lexicon.setLanguageIdentifier(ELanguageIdentifier.GERMAN);
        lexicon.setId("GN_Lexicon_0");
        lexicon.setName("GermaNet");
        return lexicon;
    }

    /**
     * Groups the LexUnits and creates one {@link LexicalEntry} per group.
     * Related forms are set in a second pass, after all entries exist.
     */
    private List<LexicalEntry> createLexicalEntries() {
        this.groupLUs();
        LexicalEntryGenerator leGen = new LexicalEntryGenerator(this, resourceVersion);
        List<LexicalEntry> lexicalEntries = new LinkedList<LexicalEntry>();
        // Create a LexicalEntry for each luGroup
        for (Set<LexUnit> luGroup : luGroups) {
            lexicalEntries.add(leGen.createLexicalEntry(luGroup));
        }
        // Setting RelatedForms of LexicalEntries
        for (LexicalEntry lexicalEntry : lexicalEntries) {
            leGen.setRelatedForms(lexicalEntry);
        }
        return lexicalEntries;
    }

    /**
     * Logs the number of verb entries and the total number of their senses.
     */
    private void logVerbStatistics(List<LexicalEntry> lexicalEntries) {
        int noVerbs = 0;
        int noVerbSenses = 0;
        for (LexicalEntry le : lexicalEntries) {
            // Constant on the left: entries without a part of speech are skipped instead of throwing
            if (EPartOfSpeech.verb.equals(le.getPartOfSpeech())) {
                noVerbs++;
                noVerbSenses += le.getSenses().size();
            }
        }
        logger.info("Generated verb lemmas: " + noVerbs + '\n'
                + "Generated verb senses: " + noVerbSenses);
    }

    /**
     * This method groups all LexUnits by lemma and {@link WordCategory}.
     * The groups and the lookup index used by {@link #getLUGroup(LexUnit)}
     * are rebuilt from scratch on every call.
     * @see LexUnit
     */
    private void groupLUs() {
        Set<Set<LexUnit>> groups = new LinkedHashSet<Set<LexUnit>>();
        Map<LexUnit, Set<LexUnit>> index = new HashMap<LexUnit, Set<LexUnit>>();
        for (WordCategory pos : WordCategory.values()) {
            // Sorted by orthographic form, one map per word category
            Map<String, Set<LexUnit>> orthFormLUGroupMappings =
                    new TreeMap<String, Set<LexUnit>>();
            for (LexUnit lu : gnet.getLexUnits(pos)) {
                String orthForm = lu.getOrthForm();
                Set<LexUnit> luGroup = orthFormLUGroupMappings.get(orthForm);
                if (luGroup == null) {
                    luGroup = new LinkedHashSet<LexUnit>();
                    orthFormLUGroupMappings.put(orthForm, luGroup);
                }
                luGroup.add(lu);
            }
            for (Set<LexUnit> luGroup : orthFormLUGroupMappings.values()) {
                groups.add(luGroup);
                for (LexUnit lu : luGroup) {
                    // Keep the first group a LexUnit was seen in
                    if (!index.containsKey(lu)) {
                        index.put(lu, luGroup);
                    }
                }
            }
        }
        luGroups = groups;
        luGroupIndex = index;
    }

    /**
     * This method consumes a {@link LexUnit} and returns a {@link Set} of
     * LexUnits with equal lemma and {@link WordCategory}
     * @param lexUnit an instance of LexUnit for which
     * a Set of LexUnits with equal lemma and WordCategory should be returned
     * @return a set of LexUnits with equal lemma and WordCategory to consumed lexUnit,
     * or {@code null} if the consumed lexUnit belongs to no group (this includes
     * the case that the groups have not been built yet)
     */
    protected Set<LexUnit> getLUGroup(LexUnit lexUnit){
        return luGroupIndex.get(lexUnit);
    }

    /**
     * Returns the {@link GermaNet} object associated to this {@link GNConverter}
     * @return GermaNet object associated to this GNConverter
     */
    public GermaNet getGnet() {
        return gnet;
    }

    /**
     * Returns the {@link SubcategorizationFrameExtractor} associated to this {@link GNConverter}
     * @return the SubcategorizationFrameExtractor associated to this GNConverter
     */
    public SubcategorizationFrameExtractor getSubcategorizationFrameExtractor() {
        return subcategorizationFrameExtractor;
    }

    /**
     * Returns the {@link LexicalResource} object, which contains the results of the conversion
     * @return an instance of LexicalResource, which contains the results of the conversion
     */
    public LexicalResource getLexicalResource() {
        return lexicalResource;
    }

    /**
     * Returns the {@link SynsetGenerator} instance associated with this {@link GNConverter}.
     *
     * @return the synset generator associated with this converter
     */
    public SynsetGenerator getSynsetGenerator() {
        return synsetGenerator;
    }
}