/******************************************************************************* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.lmf.transform.omegawiki; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry; import de.tudarmstadt.ukp.lmf.model.core.Lexicon; import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech; import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation; import de.tudarmstadt.ukp.lmf.model.morphology.Lemma; import de.tudarmstadt.ukp.omegawiki.api.DefinedMeaning; import de.tudarmstadt.ukp.omegawiki.api.OWLanguage; import de.tudarmstadt.ukp.omegawiki.api.OmegaWiki; import de.tudarmstadt.ukp.omegawiki.api.SynTrans; import de.tudarmstadt.ukp.omegawiki.exception.OmegaWikiException; /** * This class generates LexicalEntries * @author Michael Matuschek * */ public class LexicalEntryGenerator { private int GlobalLanguage; private String GlobalLanguageLMF; private final OmegaWiki omegawiki; private final Lexicon lexicon; /* * LexemeGroups with same Lemma * corresponding LexicalEntry is also mapped */ private Map<Set<SynTrans>, LexicalEntry> lexemeGroupLexicalEntryMaping; /* * Mapping from OW to Uby POS */ private final HashMap<String, EPartOfSpeech> posMappings = new HashMap<String, EPartOfSpeech>(); private final HashMap<String,HashMap<String, HashSet<SynTrans>>> posLemmaLexemeGroup = new HashMap<String, HashMap<String, HashSet<SynTrans>>>(); private final Map<SynTrans, Set<SynTrans>> lexemeToGroupMappings = new HashMap<SynTrans, Set<SynTrans>> (); /* * All generated LexicalEntries */ private final List<LexicalEntry> lexicalEntries = new LinkedList<LexicalEntry>(); private int lexicalEntryNumber; private boolean initialized=false; // True when lexicalEntryGenerator is initialized private SenseGenerator senseGenerator; // SenseGenerator // private final String resourceVersion; /** * Constructs a LexicalEntryGenerator * based on consumed OmegaWiki Dictionary * @param omegawiki * @param synsetGenerator * @param lexicon * @param resourceVersion Version of the resource * @throws OmegaWikiException * @throws UnsupportedEncodingException */ public LexicalEntryGenerator(OmegaWiki omegawiki, SynsetGenerator synsetGenerator, Lexicon lexicon, String resourceVersion) throws UnsupportedEncodingException, OmegaWikiException { this.omegawiki = omegawiki; this.lexicon=lexicon; // this.resourceVersion = resourceVersion; if(!initialized){ this.GlobalLanguage=synsetGenerator.getGlobalLanguage(); this.GlobalLanguageLMF=synsetGenerator.getGlobalLanguageLMF(); lexicalEntryNumber = 0; groupLexemes(); // Put the POS mappings to map an EPartOfSpeech posMappings.put("noun", EPartOfSpeech.noun); posMappings.put("verb", EPartOfSpeech.verb); posMappings.put("adjective", EPartOfSpeech.adjective); posMappings.put("adverb", EPartOfSpeech.adverb); posMappings.put("personal pronoun", EPartOfSpeech.pronounPersonal); posMappings.put("indefinite article", EPartOfSpeech.determinerIndefinite); posMappings.put("demonstrative", EPartOfSpeech.pronoun); posMappings.put("definite article", EPartOfSpeech.determiner); posMappings.put("conjunction", EPartOfSpeech.conjunction); posMappings.put("cardinal number", EPartOfSpeech.numeral); posMappings.put("preposition", EPartOfSpeech.adpositionPreposition); posMappings.put("independent verb", EPartOfSpeech.verb); posMappings.put("determined cardinal", EPartOfSpeech.numeral); posMappings.put("contraction", EPartOfSpeech.particle); posMappings.put("pronoun", EPartOfSpeech.pronoun); posMappings.put("interjection", EPartOfSpeech.interjection); posMappings.put("article", EPartOfSpeech.determiner); posMappings.put("determiner", EPartOfSpeech.determiner); posMappings.put("subjunktion", EPartOfSpeech.conjunctionSubordinating); posMappings.put("name", EPartOfSpeech.nounProper); posMappings.put("transitive verb", EPartOfSpeech.verb); posMappings.put("numeral", EPartOfSpeech.numeral); posMappings.put("intransitive verb", EPartOfSpeech.verb); posMappings.put("interrogative pronoun", EPartOfSpeech.pronounInterrogative); posMappings.put("indefinite pronoun", EPartOfSpeech.pronoun); posMappings.put("impersonal verb", EPartOfSpeech.verb); posMappings.put("relative pronou", EPartOfSpeech.pronoun); posMappings.put("possessive pronoun", EPartOfSpeech.pronoun); posMappings.put("reflexive verb", EPartOfSpeech.verb); posMappings.put("prefix", EPartOfSpeech.particle); posMappings.put("suffix", EPartOfSpeech.particle); posMappings.put("exclaiming pronoun", EPartOfSpeech.pronoun); posMappings.put("Determinativpronomen", EPartOfSpeech.pronoun); posMappings.put("aanwijzend voornaamwoord", EPartOfSpeech.pronounDemonstrative); posMappings.put("adjective", EPartOfSpeech.adjective); posMappings.put("name", EPartOfSpeech.nounProper); posMappings.put("betrekkelijk voornaamwoord", EPartOfSpeech.pronounRelative); posMappings.put("koppelwerkwoord", EPartOfSpeech.verb); posMappings.put("noun", EPartOfSpeech.noun); posMappings.put("soortnaam", EPartOfSpeech.noun); posMappings.put("telwoord", EPartOfSpeech.numeral); posMappings.put("onbepaald lidwoord", EPartOfSpeech.determinerIndefinite); posMappings.put("article", EPartOfSpeech.determiner); posMappings.put("auxiliary verb", EPartOfSpeech.verbAuxiliary); posMappings.put("bezittelijk voornaamwoord", EPartOfSpeech.pronounPossessive); posMappings.put("subjunktion", EPartOfSpeech.conjunctionSubordinating); posMappings.put("tussenwerpsel", EPartOfSpeech.interjection); posMappings.put("bepaald lidwoord", EPartOfSpeech.determinerDefinite); posMappings.put("onbepaald voornaamwoord", EPartOfSpeech.pronounIndefinite); posMappings.put("voegwoord", EPartOfSpeech.conjunction); posMappings.put("bepaald hoofdtelwoord", EPartOfSpeech.numeral); posMappings.put("onbepaald hoofdtelwoord", EPartOfSpeech.numeral); posMappings.put("preposition", EPartOfSpeech.adpositionPreposition); posMappings.put("Ortsadverb", EPartOfSpeech.adverb); posMappings.put("onomatopoeia", EPartOfSpeech.interjection); posMappings.put("wederkerend voornaamwoord", EPartOfSpeech.pronounPersonalReflexive); posMappings.put("uitroepend voornaamwoord", EPartOfSpeech.pronoun); posMappings.put("intransitive verb", EPartOfSpeech.verb); posMappings.put("transitive verb", EPartOfSpeech.verb); posMappings.put("vragend voornaamwoord", EPartOfSpeech.pronounInterrogative); posMappings.put("wederkerig voornaamwoord", EPartOfSpeech.pronoun); posMappings.put("determiner", EPartOfSpeech.determiner); posMappings.put("Abstrakta", EPartOfSpeech.noun); // posMappings.put("ナ形容詞", EPartOfSpeech.symbol); // posMappings.put("adnominal", EPartOfSpeech.adjective); // posMappings.put("adposition", EPartOfSpeech.symbol); posMappings.put("Richtungsadverb", EPartOfSpeech.adverb); posMappings.put("adverb", EPartOfSpeech.adverb); posMappings.put("Kausaladverb", EPartOfSpeech.adverb); posMappings.put("verb", EPartOfSpeech.verb); posMappings.put("Sammelname", EPartOfSpeech.noun); posMappings.put("Konjunktionaladverb", EPartOfSpeech.adverb); posMappings.put("disjunktives Konjunktionaladverb", EPartOfSpeech.adverb); posMappings.put("kausales Konjunktionaladverb", EPartOfSpeech.adverb); posMappings.put("konzessives Konjunktionaladverb", EPartOfSpeech.adverb); posMappings.put("adversatives Konjunktionaladverb", EPartOfSpeech.adverb); posMappings.put("konsekutives Konjunktionaladverb", EPartOfSpeech.adverb); posMappings.put("kopulatives Konjunktionaladverb", EPartOfSpeech.adverb); posMappings.put("measure word", EPartOfSpeech.numeral); posMappings.put("demonstrative", EPartOfSpeech.determinerDemonstrative); posMappings.put("Ausdruckswort", EPartOfSpeech.interjection); posMappings.put("Modaladverb", EPartOfSpeech.adverb); // posMappings.put("終助詞", EPartOfSpeech.symbol); posMappings.put("formal noun", EPartOfSpeech.noun); posMappings.put("Grußwort", EPartOfSpeech.interjection); // posMappings.put("イ形容詞", EPartOfSpeech.symbol); posMappings.put("personal pronoun", EPartOfSpeech.pronounPersonal); posMappings.put("Interrogativadverb", EPartOfSpeech.adverb); posMappings.put("contraction", EPartOfSpeech.contraction); posMappings.put("Lokaladverb", EPartOfSpeech.adverb); posMappings.put("Stoffname", EPartOfSpeech.noun); // posMappings.put("動作名詞", EPartOfSpeech.symbol); posMappings.put("nominativ gebrauchtes Verb", EPartOfSpeech.noun); posMappings.put("Partikel der Bejahung oder Verneinung", EPartOfSpeech.particleAnswer); // posMappings.put("助詞", EPartOfSpeech.symbol); posMappings.put("pronoun", EPartOfSpeech.pronoun); posMappings.put("interrogative word", EPartOfSpeech.pronounInterrogative); posMappings.put("Satzadverb", EPartOfSpeech.adverb); posMappings.put("Temporaladverb", EPartOfSpeech.adverb); // posMappings.put("bacru", EPartOfSpeech.symbol); // posMappings.put("outrecuidant", EPartOfSpeech.symbol); arrogant posMappings.put("Determinativpronomen", EPartOfSpeech.pronoun); // posMappings.put("grammatical property", EPartOfSpeech.symbol); posMappings.put("Oronym", EPartOfSpeech.nounProper); posMappings.put("Nomen", EPartOfSpeech.noun); posMappings.put("coordonnant", EPartOfSpeech.conjunctionCoordinating); posMappings.put("subordinating conjunction", EPartOfSpeech.conjunctionSubordinating); posMappings.put("proword", EPartOfSpeech.pronoun); posMappings.put("proword for verb", EPartOfSpeech.pronoun); posMappings.put("common noun", EPartOfSpeech.nounCommon); // posMappings.put("stupid", EPartOfSpeech.symbol); senseGenerator = new SenseGenerator(synsetGenerator,resourceVersion); createLexicalEntries(); initialized = true; } } /** * This method groups all Lexemes (SynTranses) by lemma * * Optimization potential here with a dedicated method for retrieving SynTranses = Lexemes * @throws OmegaWikiException * @throws UnsupportedEncodingException, OmegaWikiException */ @SuppressWarnings("unchecked") private void groupLexemes() throws UnsupportedEncodingException, OmegaWikiException { int overall = 0; int current = 0; lexemeGroupLexicalEntryMaping= new HashMap<Set<SynTrans>, LexicalEntry>(); Iterator<DefinedMeaning> dmIter = null; // DefinedMeaning iterator HashMap<String, HashSet<SynTrans>>lemmaLexemeGroup = new HashMap<String, HashSet<SynTrans>>(); try { Set<DefinedMeaning> defMeanings = omegawiki.getAllDefinedMeanings(this.GlobalLanguage); dmIter = defMeanings.iterator(); overall = defMeanings.size(); } catch (Exception e) { e.printStackTrace(); } while(dmIter.hasNext()){ if(current++ % 1000 == 0) { System.out.println("Grouping lexemes... " + ((current * 100) / overall) + "%"); } DefinedMeaning dm = dmIter.next(); String pos=""; Set<SynTrans> lexemes = dm.getSynTranses(GlobalLanguage);// lexemes of the Synset = SynTranses in the desired language //Distinction by variants of English if(OWLanguage.English == GlobalLanguage) { for(SynTrans stuk : dm.getSynTranses(OWLanguage.English_United_Kingdom)) { @SuppressWarnings("rawtypes") HashSet toAdd = new HashSet(); boolean found = false; for (SynTrans orig : lexemes) { if(orig.getSyntrans().getSpelling().equals( stuk.getSyntrans().getSpelling())) { found = true; break; } if(!found) { toAdd.add(stuk); } } lexemes.addAll(toAdd); } for(SynTrans stus : dm.getSynTranses(OWLanguage.English_United_States)) { @SuppressWarnings("rawtypes") HashSet toAdd = new HashSet(); boolean found = false; for (SynTrans orig : lexemes) { if(orig.getSyntrans().getSpelling().equals( stus.getSyntrans().getSpelling())) { found = true; break; } if(!found) { toAdd.add(stus); } } lexemes.addAll(toAdd); } } //Handle unknown POS properly for(SynTrans lexeme : lexemes){ pos = (lexeme.getPOS()==null? "unknown": lexeme.getPOS().getValue()); if((lemmaLexemeGroup = posLemmaLexemeGroup.get(pos)) == null) { lemmaLexemeGroup = new HashMap<String, HashSet<SynTrans>>(); } HashSet<SynTrans> lexemeGroup; // Group of Lexemes with the same lemma String lemma = lexeme.getSyntrans().getSpelling(); // Lemma's Lexeme // if((lexemeGroup = lemmaLexemeGroup.get(lemma)) == null){ lexemeGroup = new HashSet<SynTrans>(); lemmaLexemeGroup.put(lemma, lexemeGroup); } lexemeGroup.add(lexeme); lexemeToGroupMappings.put(lexeme, lexemeGroup); posLemmaLexemeGroup.put(pos, lemmaLexemeGroup); } } System.out.println("Grouping lexemes... done"); } /** * This method iterates over All lexemeGroups and * creates a List of LexicalEntries * @throws OmegaWikiException * @throws UnsupportedEncodingException */ private void createLexicalEntries() throws UnsupportedEncodingException, OmegaWikiException{ System.out.println("Transforming lexeme groups... 0%"); for (Entry<String, HashMap<String, HashSet<SynTrans>>> pos : posLemmaLexemeGroup.entrySet()) { int current = 0; int overall = pos.getValue().size(); for(Entry<String, HashSet<SynTrans>> lemmaSet : pos.getValue().entrySet()) { Set<SynTrans> sts = lemmaSet.getValue(); if(current++ % 1000 == 0) System.out.println("Transforming lexeme groups " + pos.getKey() + "... " + ((current * 100) / overall) + "%"); LexicalEntry lexicalEntry = createLexicalEntry(sts, pos.getKey(), lemmaSet.getKey(), lexicon); lexicalEntries.add(lexicalEntry); lexemeGroupLexicalEntryMaping.put(sts, lexicalEntry); } } System.out.println("Transforming lexeme groups... done"); } /** * This method consumes a lexemeGroup and returns * the corresponding LexicalEntry * @param lexemeGroup * @param pos * @param ow_lemma * @param lexicon * @return LexicalEntry that corresponds to lexemeGroup * @throws OmegaWikiException * @throws UnsupportedEncodingException */ private LexicalEntry createLexicalEntry(Set<SynTrans> lexemeGroup, String pos, String ow_lemma,Lexicon lexicon) throws UnsupportedEncodingException, OmegaWikiException { // Create a new LexicalEntry for this group of lexemes LexicalEntry lexicalEntry = new LexicalEntry(); lexicalEntry.setLexicon(lexicon); // Create ID for this lexicalEntry lexicalEntry.setId(createID()); lexicalEntry.setPartOfSpeech(getPOS(pos)); //*** Creating Lemma ***// Lemma lemma = new Lemma(); List<FormRepresentation> formRepresentations = new LinkedList<FormRepresentation>(); FormRepresentation formRepresentation = new FormRepresentation(); formRepresentation.setLanguageIdentifier(OmegaWikiLMFMap.mapLanguage(GlobalLanguage)); formRepresentation.setWrittenForm(ow_lemma); formRepresentations.add(formRepresentation); for(SynTrans st : lexemeGroup) { if(st.getSyntrans().getLanguageId()!=GlobalLanguage) { formRepresentation.setGeographicalVariant(OWLanguage.getName(st.getSyntrans().getLanguageId())); } } lemma.setFormRepresentations(formRepresentations); lexicalEntry.setLemma(lemma); //*** Creating Senses ***// lexicalEntry.setSenses(senseGenerator.generateSenses(lexemeGroup,lexicalEntry)); lexicalEntry.getSenses(); return lexicalEntry; } /** * This method return the Uby Pos for a OW pos * @param pos * @return Uby Pos */ private EPartOfSpeech getPOS(String pos) { EPartOfSpeech result = posMappings.get(pos); return result; } /** * This method creates an ID for a LexicalEntry * @return ID for lexicalEntry */ private String createID() { StringBuffer sb = new StringBuffer(32); sb.append("OW_"+GlobalLanguageLMF+"_LexicalEntry_").append(lexicalEntryNumber++); return sb.toString(); } /** * @return the lexicalEntries */ public List<LexicalEntry> getLexicalEntries() { return lexicalEntries; } /** * Returns a LexicalEntry that corresponds to the consumed lexemeGroup * @param lexemeGroup * @return the LexicalEntry that corresponds to the consumed lexemeGroup */ public LexicalEntry getLexicalEntry(Set<SynTrans> lexemeGroup){ return lexemeGroupLexicalEntryMaping.get(lexemeGroup); } /** * @return the lexemeGroupLexicalEntryMaping */ public Map<Set<SynTrans>, LexicalEntry> getLexemeGroupLexicalEntryMaping() { return lexemeGroupLexicalEntryMaping; } /** * @return the senseGenerator */ public SenseGenerator getSenseGenerator() { return senseGenerator; } /** * Returns the lexemeGroup of the consumed lexeme */ public Set<SynTrans> getGroup(SynTrans lexeme){ return lexemeToGroupMappings.get(lexeme); } }