/******************************************************************************* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.lmf.transform.omegawiki; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.LinkedList; import java.util.List; import java.util.Set; import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry; import de.tudarmstadt.ukp.lmf.model.core.Sense; import de.tudarmstadt.ukp.lmf.model.core.Statement; import de.tudarmstadt.ukp.lmf.model.core.TextRepresentation; import de.tudarmstadt.ukp.lmf.model.enums.ECase; import de.tudarmstadt.ukp.lmf.model.enums.EExampleType; import de.tudarmstadt.ukp.lmf.model.enums.EGrammaticalGender; import de.tudarmstadt.ukp.lmf.model.enums.EGrammaticalNumber; import de.tudarmstadt.ukp.lmf.model.enums.ELabelNameSemantics; import de.tudarmstadt.ukp.lmf.model.enums.ELabelTypeSemantics; import de.tudarmstadt.ukp.lmf.model.enums.EStatementType; import de.tudarmstadt.ukp.lmf.model.enums.ESyntacticProperty; import de.tudarmstadt.ukp.lmf.model.meta.SemanticLabel; import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation; import de.tudarmstadt.ukp.lmf.model.morphology.Lemma; import de.tudarmstadt.ukp.lmf.model.morphology.WordForm; import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef; import de.tudarmstadt.ukp.lmf.model.semantics.SenseExample; import de.tudarmstadt.ukp.lmf.model.semantics.Synset; import de.tudarmstadt.ukp.lmf.model.syntax.LexemeProperty; import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame; import de.tudarmstadt.ukp.lmf.model.syntax.SyntacticBehaviour; import de.tudarmstadt.ukp.omegawiki.api.Annotation; import de.tudarmstadt.ukp.omegawiki.api.DefinedMeaning; import de.tudarmstadt.ukp.omegawiki.api.SynTrans; import de.tudarmstadt.ukp.omegawiki.exception.OmegaWikiException; /** * This class generates senses * @author matuschek * */ class SenseGenerator { public static final String SYNTRANS = "synTrans"; private final String resourceVersion; private final String GlobalLanguageLMF; private final int GlobalLanguage; /* * Synset generator is needed for recovering * the LMFSynset-OWSynset mappings */ private final SynsetGenerator synsetGenerator; private int LMFSenseNumber; // Running number for creating Sense-IDs private int SCFNumber; // Running number for creating SubCatFrames private int SBNumber;// Running number for creating SyntacticBehaviour // Mappings between lexemes and corresponding Senses private final HashMap<SynTrans, Sense> lexemeSenseMappings = new HashMap<SynTrans, Sense>(); protected static int exampleIdx = 1; /** * Constructs a SenseGenerator * @param omegawiki * @param synsetGenerator a SynsetGenerator * @param resourceVersion Version of the resource */ public SenseGenerator(SynsetGenerator synsetGenerator, String resourceVersion){ this.synsetGenerator = synsetGenerator; this.GlobalLanguageLMF = synsetGenerator.getGlobalLanguageLMF(); this.GlobalLanguage = synsetGenerator.getGlobalLanguage(); this.resourceVersion = resourceVersion; } /** * This method consumes a Set of lexemes (SynTrans in OW) * and returns a List of Senses for this group of lexemes * @param lexemeGroup * @param lexicalEntry * @throws OmegaWikiException * @throws UnsupportedEncodingException */ public List<Sense> generateSenses(Set<SynTrans> lexemeGroup, LexicalEntry lexicalEntry) throws UnsupportedEncodingException, OmegaWikiException{ List<Sense> result = new LinkedList<Sense>(); // Every lexeme has a sense of its own for(SynTrans lexeme : lexemeGroup){ Set<Annotation> annos = lexeme.getAnnotations(); //all annotations of this lexeme String hyphenation = ""; String phonetic = ""; String example = null; String etymology=null; String otherStatement=null; SemanticLabel sl = null; SyntacticBehaviour sb = null; LexemeProperty lp= null; boolean urlStat = false; for(Annotation anno : annos) { String name = anno.getName(); String value = anno.getValue(); if (name.equals("hyphenation")) { hyphenation = value; Lemma lemma = lexicalEntry.getLemma(); List<FormRepresentation> formRepresentations = lemma.getFormRepresentations(); FormRepresentation formRepresentation = formRepresentations.get(0); formRepresentation.setHyphenation(hyphenation); } else if (name.equals("alfabeto fonético internacional"))//phonetic form { phonetic = value; Lemma lemma = lexicalEntry.getLemma(); List<FormRepresentation> formRepresentations = lemma.getFormRepresentations(); FormRepresentation formRepresentation = formRepresentations.get(0); formRepresentation.setPhoneticForm(phonetic); } else if (name.equals("Genus")) { Lemma lemma = lexicalEntry.getLemma(); WordForm wf; if(lexicalEntry.getWordForms()!=null && lexicalEntry.getWordForms().size()>0 ) { wf = lexicalEntry.getWordForms().get(0); } else { wf = new WordForm(); } List<FormRepresentation> formRepresentations = lemma.getFormRepresentations(); wf.setFormRepresentations(formRepresentations); if(value.equals("maskulinum")) { wf.setGrammaticalGender(EGrammaticalGender.masculine); } else if (value.equals("weiblich")) { wf.setGrammaticalGender(EGrammaticalGender.feminine); } else if (value.equals("neutrum")) { wf.setGrammaticalGender(EGrammaticalGender.neuter); } if(lexicalEntry.getWordForms()==null) { lexicalEntry.setWordForms(new LinkedList<WordForm>()); lexicalEntry.getWordForms().add(wf); } } else if (name.equals("Kasus")) //Grammatical case { Lemma lemma = lexicalEntry.getLemma(); WordForm wf; if(lexicalEntry.getWordForms()!=null && lexicalEntry.getWordForms().size()>0) { wf = lexicalEntry.getWordForms().get(0); } else { wf = new WordForm(); } List<FormRepresentation> formRepresentations = lemma.getFormRepresentations(); wf.setFormRepresentations(formRepresentations); if(value.equals("Akkusativ")) { wf.setCase(ECase.accusative); } else if(value.equals("Dativ")) { wf.setCase(ECase.dative); } else if(value.equals("Nominativ")) { wf.setCase(ECase.nominative); } else if(value.equals("Genitiv")) { wf.setCase(ECase.genitive); } if(lexicalEntry.getWordForms()==null) { lexicalEntry.setWordForms(new LinkedList<WordForm>()); lexicalEntry.getWordForms().add(wf); } } else if (name.equals("Numerus"))//Grammatical number { Lemma lemma = lexicalEntry.getLemma(); WordForm wf; if(lexicalEntry.getWordForms()!=null && lexicalEntry.getWordForms().size()>0) { wf = lexicalEntry.getWordForms().get(0); } else { wf = new WordForm(); } List<FormRepresentation> formRepresentations = lemma.getFormRepresentations(); wf.setFormRepresentations(formRepresentations); if(value.equals("Singular")) { wf.setGrammaticalNumber(EGrammaticalNumber.singular); } else if (value.equals("Plural")) { wf.setGrammaticalNumber(EGrammaticalNumber.plural); } else if (value.equals("Dual")) { wf.setGrammaticalNumber(EGrammaticalNumber.plural); } if(lexicalEntry.getWordForms()==null) { lexicalEntry.setWordForms(new LinkedList<WordForm>()); lexicalEntry.getWordForms().add(wf); } } else if (name.equals("grammatical property")||name.equals("property")) //Other properties { if (value.equals("Singularetantum")) { sl = new SemanticLabel(); sl.setType(ELabelTypeSemantics.semanticNounClass); sl.setLabel(ELabelNameSemantics.SEMANTIC_NOUN_CLASS_ONLY_SINGULAR); //lexicalEntry.setSingularetantum(EYesNo.yes); } else if (value.equals("Pluraletantum")) { sl = new SemanticLabel(); sl.setType(ELabelTypeSemantics.semanticNounClass); sl.setLabel(ELabelNameSemantics.SEMANTIC_NOUN_CLASS_ONLY_PLURAL); //lexicalEntry.setPluraletantum(EYesNo.yes); } else if (value.equals("intransitive")|| value.equals("transitive")|| value.equals("impersonal")|| value.equals("reflexive")) { SubcategorizationFrame scf= new SubcategorizationFrame(); scf.setSubcatLabel(value); scf.setId(getNewSCFID()); if(lexicalEntry.getLexicon().getSubcategorizationFrames()==null) { lexicalEntry.getLexicon().setSubcategorizationFrames(new LinkedList<SubcategorizationFrame>()); } lexicalEntry.getLexicon().getSubcategorizationFrames().add(scf); sb = new SyntacticBehaviour(); sb.setSubcategorizationFrame(scf); sb.setId(getNewSBID()); if(lexicalEntry.getSyntacticBehaviours()==null) { lexicalEntry.setSyntacticBehaviours(new LinkedList<SyntacticBehaviour>()); } lexicalEntry.getSyntacticBehaviours().add(sb); } else if (value.equals("attributive")) { lp = new LexemeProperty(); lp.setSyntacticProperty(ESyntacticProperty.nonPredicativeAdjective); } else if (value.equals("predicative")) { lp = new LexemeProperty(); lp.setSyntacticProperty(ESyntacticProperty.predicativeAdjective); } //else if (value.equals("impersonal")) //{ // lp = new LexemeProperty(); // lp.setSyntacticProperty(ESyntacticProperty.impersonal); //} //else if (value.equals("reflexive")) //{ // lp = new LexemeProperty(); // lp.setSyntacticProperty(ESyntacticProperty.reflexive); //} else if (value.equals("separable")) { lexicalEntry.setSeparableParticle("yes"); // TODO is it a boolean? } else if (value.equals("inseparable")) { lexicalEntry.setSeparableParticle("no"); } } else if (name.equals("example sentence")) { example=value; } else if (name.equals("etymology")) { etymology=value; } else if (name.equals("usage")) { sl = new SemanticLabel(); if(value.equals("vulgar")||value.equals("technical")||value.equals("poetic")||value.equals("pejorative")||value.equals("offensive")||value.equals("colloquial")||value.equals("medical")||value.equals("juvenile")||value.equals("informal")||value.equals("humorous")||value.equals("euphemistic")||value.equals("kindersprache")) { sl.setType(ELabelTypeSemantics.usage); } else if(value.equals("archaic")||value.equals("alte deutsche Schreibweise")||value.equals("dated")||value.equals("neologism")||value.equals("obsolete")) { sl.setType(ELabelTypeSemantics.timePeriodOfUsage); } else { // sl.setType(ELabelTypeSemantics.regionOfUsage); } //sl.setType("time"); sl.setLabel(value); } else if(!name.equals("part of speech")) { otherStatement=value; if (value.startsWith("http:")) { urlStat= true; } } } Sense sense = new Sense(); lexemeSenseMappings.put(lexeme, sense); //set ID sense.setId(getNewID()); if (sl != null && sl.getType() != null) { String label = sl.getLabel(); if (label.length() > 255) label = label.substring(0, 255); sl.setLabel(label); sl.setParent(sense); if(sense.getSemanticLabels()==null) { sense.setSemanticLabels(new LinkedList<SemanticLabel>()); } sense.getSemanticLabels().add(sl); } if(sb!=null) { sb.setSense(sense); if(lp!=null) { sb.getSubcategorizationFrame().setLexemeProperty(lp); } } else if(lp!=null) { SubcategorizationFrame scf= new SubcategorizationFrame(); scf.setSubcatLabel(""); scf.setId(getNewSCFID()); if(lexicalEntry.getLexicon().getSubcategorizationFrames()==null) { lexicalEntry.getLexicon().setSubcategorizationFrames(new LinkedList<SubcategorizationFrame>()); } lexicalEntry.getLexicon().getSubcategorizationFrames().add(scf); sb = new SyntacticBehaviour(); sb.setSubcategorizationFrame(scf); sb.setId(getNewSBID()); sb.setSense(sense); if(lexicalEntry.getSyntacticBehaviours()==null) { lexicalEntry.setSyntacticBehaviours(new LinkedList<SyntacticBehaviour>()); } lexicalEntry.getSyntacticBehaviours().add(sb); scf.setLexemeProperty(lp); } // setting index of the Sense (lexeme's SynTrans ID) sense.setIndex(lexicalEntry.getSenses().size() + 1); lexicalEntry.getSenses().add(sense); DefinedMeaning lexemesSynset = lexeme.getDefinedMeaning(); // Lexeme's DM //set Synset Synset lmfSynset = synsetGenerator.getLMFSynset(lexemesSynset); if(lmfSynset == null){ System.err.println("Error, SenseGenerator: Could not find lmfSynset for Synset: "+ lexemesSynset); System.exit(1); } sense.setSynset(lmfSynset); sense.setLexicalEntry(lexicalEntry); if(lmfSynset.getSenses() == null) { lmfSynset.setSenses(new LinkedList<Sense>()); } lmfSynset.getSenses().add(sense); sense.setDefinitions(sense.getSynset().getDefinitions()); if(example!=null) { if(sense.getSenseExamples()==null) { sense.setSenseExamples(new LinkedList<SenseExample>()); } SenseExample se = new SenseExample(); se.setId("OW_" + this.GlobalLanguageLMF + "_SenseExample_" + (exampleIdx++)); se.setExampleType(EExampleType.senseInstance); se.setTextRepresentations(new LinkedList<TextRepresentation>()); TextRepresentation tr = new TextRepresentation(); tr.setLanguageIdentifier(OmegaWikiLMFMap.mapLanguage(GlobalLanguage)); tr.setWrittenText(example); se.getTextRepresentations().add(tr); sense.getSenseExamples().add(se); } if(etymology !=null && sense.getDefinitions()!=null && sense.getDefinitions().size()>0) { if(sense.getDefinitions().get(0).getStatements()==null) { sense.getDefinitions().get(0).setStatements(new LinkedList<Statement>()); } Statement stat = new Statement(); stat.setStatementType(EStatementType.etymology); stat.setTextRepresentations(new LinkedList<TextRepresentation>()); TextRepresentation tr = new TextRepresentation(); tr.setLanguageIdentifier(OmegaWikiLMFMap.mapLanguage(GlobalLanguage)); tr.setWrittenText(etymology); stat.getTextRepresentations().add(tr); sense.getDefinitions().get(0).getStatements().add(stat); } if(otherStatement !=null && sense.getDefinitions()!=null && sense.getDefinitions().size()>0) { if(sense.getDefinitions().get(0).getStatements()==null) { sense.getDefinitions().get(0).setStatements(new LinkedList<Statement>()); } Statement stat = new Statement(); stat.setStatementType(EStatementType.encyclopedicInformation); if(urlStat) { stat.setStatementType(EStatementType.externalReference); } stat.setTextRepresentations(new LinkedList<TextRepresentation>()); TextRepresentation tr = new TextRepresentation(); tr.setLanguageIdentifier(OmegaWikiLMFMap.mapLanguage(GlobalLanguage)); tr.setWrittenText(otherStatement); stat.getTextRepresentations().add(tr); sense.getDefinitions().get(0).getStatements().add(stat); } // Creating MonolingualExternalRef for a Sense MonolingualExternalRef monolingualExternalRef = new MonolingualExternalRef(); String senseKey; senseKey = lexeme.getSyntransid()+""; // create an external reference StringBuffer stb = new StringBuffer(32); stb.append(senseKey); monolingualExternalRef.setExternalSystem(resourceVersion + "_" + SYNTRANS); monolingualExternalRef.setExternalReference(stb.toString()); List<MonolingualExternalRef> monolingualExternalRefs = new LinkedList<MonolingualExternalRef>(); monolingualExternalRefs.add(monolingualExternalRef); sense.setMonolingualExternalRefs(monolingualExternalRefs); // Add the created Sense to the result result.add(sense); } return result; } /** * This method generates a Sense-ID */ private String getNewID() { StringBuffer sb = new StringBuffer(64); sb.append("OW_"+GlobalLanguageLMF.toString()+"_Sense_").append(Integer.toString(LMFSenseNumber)); LMFSenseNumber++; return sb.toString(); } /** * This method generates a SubCatFrame-ID */ private String getNewSCFID() { StringBuffer sb = new StringBuffer(64); sb.append("OW_"+GlobalLanguageLMF.toString()+"_SubcatFrame_").append(Integer.toString(SCFNumber)); SCFNumber++; return sb.toString(); } /** * This method generates a SyntacticBehaviour-ID */ private String getNewSBID() { StringBuffer sb = new StringBuffer(64); sb.append("OW_"+GlobalLanguageLMF.toString()+"_SyntacticBehaviour_").append(Integer.toString(SBNumber)); SBNumber++; return sb.toString(); } /** * This method returns the corresponding sense of a Lexeme */ public Sense getSense(SynTrans lexeme){ return lexemeSenseMappings.get(lexeme); } /** * Returns all Lexemes processed by SenseGenerator */ public Set<SynTrans> getProcessedLexemes(){ return lexemeSenseMappings.keySet(); } }