/**
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */
package de.tudarmstadt.ukp.lmf.transform.wordnet;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;

import net.sf.extjwnl.JWNLException;
import net.sf.extjwnl.data.POS;
import net.sf.extjwnl.data.Word;
import net.sf.extjwnl.dictionary.Dictionary;
import net.sf.extjwnl.dictionary.MorphologicalProcessor;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.tudarmstadt.ukp.lmf.model.core.Definition;
import de.tudarmstadt.ukp.lmf.model.core.Statement;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import de.tudarmstadt.ukp.lmf.model.enums.EStatementType;
import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tudarmstadt.ukp.lmf.transform.wordnet.util.WNConvUtil;

/**
 * Instances of this class offer methods for creating {@link Synset} instances
 * from WordNet's data.
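 * <p>
 * A minimal usage sketch (the dictionary setup via extjwnl's default resource
 * and the version label {@code "WordNet_3.0"} are illustrative assumptions,
 * not fixed by this class):
 * <pre>{@code
 * Dictionary wordnet = Dictionary.getDefaultResourceInstance();
 * SynsetGenerator generator = new SynsetGenerator(wordnet, "WordNet_3.0");
 * generator.initialize();
 * List<Synset> synsets = generator.getSynsets();
 * }</pre>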
 */
public class SynsetGenerator {

    protected static class ExampleMapping {
        protected String senseKey;
        protected String lemma;
        protected int score;

        public ExampleMapping(final String senseKey, final String lemma) {
            this.senseKey = senseKey;
            this.lemma = lemma;
        }

        public String getSenseKey() {
            return senseKey;
        }

        public String getLemma() {
            return lemma;
        }

        public int getScore() {
            return score;
        }

        public void setScore(int score) {
            this.score = score;
        }

        public void addScore(final int increment) {
            this.score += increment;
        }
    }

    public final static String EXTERNAL_SYSTEM_SYNSET_OFFSET = "synsetOffset";

    private final Log logger = LogFactory.getLog(getClass());

    private final Dictionary wordnet; // WordNet dictionary.
    private MorphologicalProcessor morphProcessor;
    private final String resourceVersion;
    private boolean initialized = false;

    private final List<Synset> synsets = new ArrayList<Synset>();
    private int lmfSynsetNumber = 0; // Running number used for creating IDs of Synsets.

    // Mappings between WordNet's synsets and Uby-LMF synsets.
    private final Map<net.sf.extjwnl.data.Synset, Synset> wnSynsetLMFSynsetMappings =
            new HashMap<net.sf.extjwnl.data.Synset, Synset>();

    // Mappings between lexemes and associated example sentences (extracted
    // from WordNet's glosses).
    private final Map<String, List<String>> examples = new TreeMap<String, List<String>>();

    protected List<String> annotationList;
    protected int[] annotationCounter = new int[10];

    /**
     * Constructs a {@link SynsetGenerator} based on the consumed parameters.
     * @param wordnet an initialized {@link Dictionary} instance, used for
     *        accessing information encoded in WordNet's files
     * @param resourceVersion the version of the resource
     */
    public SynsetGenerator(final Dictionary wordnet, final String resourceVersion) {
        this.wordnet = wordnet;
        this.resourceVersion = resourceVersion;
    }

    /**
     * Transforms WordNet synsets to UBY synsets and stores the result in
     * member variables. Initialization is done only once.
     */
    public void initialize() throws JWNLException {
        if (initialized) {
            return;
        }

        // Create UBY-LMF synsets.
        for (POS pos : POS.getAllPOS()) {
            logger.info("processing " + pos.getLabel());
            Iterator<net.sf.extjwnl.data.Synset> synIter = wordnet.getSynsetIterator(pos);
            while (synIter.hasNext()) {
                net.sf.extjwnl.data.Synset wnSynset = synIter.next();

                // Synset.
                Synset lmfSynset = new Synset();
                lmfSynset.setId("WN_Synset_" + lmfSynsetNumber);
                lmfSynsetNumber++;
                synsets.add(lmfSynset);
                wnSynsetLMFSynsetMappings.put(wnSynset, lmfSynset);

                // Definition.
                List<String> statementTexts = new ArrayList<String>();
                String senseDefinition = processGloss(wnSynset, lmfSynset, statementTexts);
                if (senseDefinition != null && !senseDefinition.isEmpty()) {
                    List<Definition> definitions = new LinkedList<Definition>();
                    Definition definition = new Definition();
                    definition.setTextRepresentations(WNConvUtil.makeTextRepresentationList(
                            senseDefinition, ELanguageIdentifier.ENGLISH));
                    definitions.add(definition);
                    lmfSynset.setDefinitions(definitions);

                    // Statement.
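                    // statementTexts contains sense examples that could not be
                    // attributed to a specific lexeme (see processExample); they
                    // are kept as usageNote Statements on the definition.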
                    if (statementTexts.size() > 0) {
                        List<Statement> statements = new ArrayList<Statement>();
                        for (String statementText : statementTexts) {
                            Statement statement = new Statement();
                            statement.setStatementType(EStatementType.usageNote);
                            statement.setTextRepresentations(WNConvUtil.makeTextRepresentationList(
                                    statementText, ELanguageIdentifier.ENGLISH));
                            statements.add(statement);
                        }
                        definition.setStatements(statements);
                    }
                }

                // MonolingualExternalRef.
                MonolingualExternalRef monolingualExternalRef = new MonolingualExternalRef();
                monolingualExternalRef.setExternalSystem(
                        resourceVersion + "_" + EXTERNAL_SYSTEM_SYNSET_OFFSET);
                // The external reference has the form "<offset>-<POS key>".
                // TODO: implications (of this format)?
                monolingualExternalRef.setExternalReference(
                        wnSynset.getOffset() + "-" + wnSynset.getPOS().getKey());
                List<MonolingualExternalRef> monolingualExternalRefs =
                        new LinkedList<MonolingualExternalRef>();
                monolingualExternalRefs.add(monolingualExternalRef);
                lmfSynset.setMonolingualExternalRefs(monolingualExternalRefs);
            }
        }

        // Write out missing annotations. Remove the space in "/** /" below to
        // enable this debugging block (when enabled, it additionally requires
        // imports of java.io.PrintWriter and java.io.FileNotFoundException).
        /** /
        if (annotationList != null) {
            for (int i = 0; i < 10; i++) {
                System.out.println(i + "\t" + annotationCounter[i]);
            }
            try {
                logger.warn("Example disambiguation missing. Check annotations.txt");
                PrintWriter writer = new PrintWriter("annotations.txt");
                for (String annotLine : annotationList) {
                    writer.println(annotLine);
                }
                writer.close();
            } catch (FileNotFoundException e) {
                throw new RuntimeException(e);
            }
        }
        /**/

        initialized = true;
    }

    protected String processGloss(final net.sf.extjwnl.data.Synset wnSynset,
            final Synset lmfSynset, final List<String> statements) throws JWNLException {
        // Split the gloss into a sense definition and sense examples; the
        // examples are enclosed in double quotes.
        String gloss = wnSynset.getGloss();
        String senseDefinition = "";
        String senseExamples = null;
        boolean endsWithDelim = false;
        do {
            int idx = gloss.indexOf("\"");
            if (idx >= 0) {
                senseDefinition = senseDefinition + gloss.substring(0, idx);
                gloss = gloss.substring(idx + 1);
                senseExamples = gloss;
            } else {
                senseDefinition = senseDefinition + gloss + ";";
            }

            // A quote that does not follow a delimiter is part of the
            // definition itself rather than an example marker; restore it.
            String tmp = senseDefinition.trim();
            endsWithDelim = (";:.,)".indexOf(tmp.charAt(tmp.length() - 1)) >= 0);
            if (!endsWithDelim) {
                senseDefinition = senseDefinition + "\"";
            }
        } while (!endsWithDelim);
        senseDefinition = senseDefinition.trim();
        if (!senseDefinition.isEmpty()) {
            // Remove the trailing delimiter.
            senseDefinition = senseDefinition.substring(0, senseDefinition.length() - 1).trim();
        }

        // Separate the individual sense examples.
        if (senseExamples != null) {
            int idx;
            do {
                idx = senseExamples.indexOf("\"");
                if (idx >= 0) {
                    String senseExample = senseExamples.substring(0, idx);
                    processExample(wnSynset, senseExample, statements);
                    senseExamples = senseExamples.substring(idx + 1);
                    idx = senseExamples.indexOf("\"");
                    if (idx >= 0) {
                        senseExamples = senseExamples.substring(idx + 1);
                    }
                }
            } while (idx >= 0);
        }
        return senseDefinition;
    }

    protected String cleanText(final String text) {
        // Lowercase the text and collapse whitespace and punctuation into
        // single blanks.
        StringBuilder result = new StringBuilder();
        boolean wasWhitespace = false;
        for (char c : text.toCharArray()) {
            if (" \t\n\r.,!?:;()`'-".indexOf(c) >= 0) {
                if (!wasWhitespace) {
                    result.append(' ');
                }
                wasWhitespace = true;
            } else {
                result.append(Character.toLowerCase(c));
                wasWhitespace = false;
            }
        }
        return result.toString().trim();
    }

    protected void processExample(final net.sf.extjwnl.data.Synset wnSynset,
            final String senseExample, final List<String> statements) throws JWNLException {
        // Clean the example and the sense lemmas.
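        // Disambiguation proceeds as a cascade of increasingly fuzzy
        // heuristics (exact word match, prefix match, base-form match,
        // longest common prefix); the first stage that matches decides
        // which sense key(s) the example is attached to.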
        String example = " " + cleanText(senseExample) + " ";
        List<ExampleMapping> mappings = new ArrayList<ExampleMapping>();
        for (Word word : wnSynset.getWords()) {
            mappings.add(new ExampleMapping(word.getSenseKey(), cleanText(word.getLemma())));
        }

        // Step 0: Check if there is a manual disambiguation.
//        String senseKey = manualDisambiguation.get(wnSynset.getOffset() + wnSynset.getPOS().getKey());
//        if (senseKey != null)
//            saveExampleMapping(senseExample, senseKey);

        // Step 1: Check whether the lemma is a substring of the example.
        boolean hasExactWordMatch = false;
        boolean hasPrefixMatch = false;
        for (ExampleMapping mapping : mappings) {
            String lemma = mapping.getLemma();
            int idx = example.indexOf(" " + lemma + " ");
            if (idx >= 0) {
                // Found an exact word match.
                mapping.setScore(3);
                hasExactWordMatch = true;
                continue;
            }
            idx = example.indexOf(" " + lemma);
            if (idx >= 0) {
                // Found a prefix match.
                mapping.setScore(2);
                hasPrefixMatch = true;
                continue;
            }

            // Check for prefix matches of the full list of lemma tokens.
            String regEx = lemma.replace(" ", "\\S*? ") + "\\S*?";
            if (Pattern.compile(regEx).matcher(example).find()) {
                mapping.setScore(1);
                hasPrefixMatch = true;
            }
            /*boolean hasPrefixTokenMatch = true;
            List<String> lemmaTokens = segmentTokens(lemma);
            for (String lemmaToken : lemmaTokens) {
                if (example.indexOf(" " + lemmaToken) < 0) {
                    hasPrefixTokenMatch = false;
                    break;
                }
            }
            if (hasPrefixTokenMatch) {
                mapping.setScore(1);
                hasPrefixMatch = true;
            }*/
        }
        if (hasExactWordMatch) {
            saveExampleMappings(senseExample, mappings, 3, true);
            annotationCounter[0]++;
            return;
        }
        annotationCounter[1]++;
        if (hasPrefixMatch) {
            saveExampleMappings(senseExample, mappings, 1, true);
            annotationCounter[2]++;
            return;
        }
        annotationCounter[3]++;

        // Step 2: Match single-word lemmas with all base forms.
        Set<String> baseForms = makeBaseFormList(example);
        boolean hasBaseFormMatch = false;
        for (ExampleMapping mapping : mappings) {
            String lemma = mapping.getLemma();
            if (baseForms.contains(lemma)) {
                mapping.addScore(1);
                hasBaseFormMatch = true;
            }
        }
        if (hasBaseFormMatch) {
            saveExampleMappings(senseExample, mappings, 1, true);
            annotationCounter[4]++;
            return;
        }
        annotationCounter[5]++;

        // Step 3: Match multi-word lemmas with all base forms.
        hasBaseFormMatch = false;
        for (ExampleMapping mapping : mappings) {
            String lemma = mapping.getLemma();
            List<String> lemmaTokens = segmentTokens(lemma);
            boolean hasMultiWordBaseFormMatch = true;
            for (String lemmaToken : lemmaTokens) {
                if (!baseForms.contains(lemmaToken)) {
                    hasMultiWordBaseFormMatch = false;
                    break;
                }
            }
            if (hasMultiWordBaseFormMatch) {
                mapping.addScore(1);
                hasBaseFormMatch = true;
            }
        }
        if (hasBaseFormMatch) {
            saveExampleMappings(senseExample, mappings, 1, false);
            annotationCounter[6]++;
            return;
        }
        annotationCounter[7]++;

        // Step 4: Find the longest prefix matches of all lemma tokens.
        int maxScore1 = 0;
        int maxScore2 = 0;
        for (ExampleMapping mapping : mappings) {
            String lemma = mapping.getLemma();
            List<String> lemmaTokens = segmentTokens(lemma);
            for (String lemmaToken : lemmaTokens) {
                // Trim each lemma token letter by letter and check for the
                // longest prefix match in the example sentence.
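                // E.g. for a (hypothetical) token "painting", the prefixes
                // "painting", "paintin", ..., "pai" are tried, longest first,
                // down to a minimum length of three characters.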
                int tokenLen = lemmaToken.length();
                for (int i = 0; i < tokenLen - 2; i++) {
                    String lemmaPrefix = " " + lemmaToken.substring(0, tokenLen - i);
                    if (example.indexOf(lemmaPrefix) >= 0) {
                        mapping.addScore(lemmaPrefix.length() - 1);
                        break;
                    }
                }
            }

            // Track the two highest scores in order to detect ties.
            int score = mapping.getScore();
            if (score >= maxScore1) {
                maxScore2 = maxScore1;
                maxScore1 = score;
            } else if (score >= maxScore2) {
                maxScore2 = score;
            }
        }
        if (maxScore1 > 0 && maxScore2 == 0) {
            saveExampleMappings(senseExample, mappings, maxScore1, false);
            annotationCounter[8]++;
            return;
        }
        annotationCounter[9]++;

        // Step 5: This example requires manual disambiguation. Add it to the
        // annotation list.
        if (annotationList == null) {
            annotationList = new ArrayList<String>();
        }
        annotationList.add(wnSynset.getOffset() + wnSynset.getPOS().getKey() + "\t" + senseExample);
        for (Word word : wnSynset.getWords()) {
            annotationList.add("\t\t" + word.getSenseKey() + "\t" + word.getLemma());
        }
        annotationList.add("");

        // Step 6: Since we still have no clue about the example, add it to the
        // statements list.
        statements.add(senseExample);
    }

    protected List<String> segmentTokens(String text) {
        List<String> result = new ArrayList<String>();
        int idx;
        String remainingString = text;
        do {
            idx = remainingString.indexOf(' ');
            String token;
            if (idx >= 0) {
                token = remainingString.substring(0, idx);
                remainingString = remainingString.substring(idx + 1);
            } else {
                token = remainingString;
            }
            result.add(token);
        } while (idx >= 0);
        return result;
    }

    protected Set<String> makeBaseFormList(final String example) throws JWNLException {
        if (morphProcessor == null) {
            morphProcessor = wordnet.getMorphologicalProcessor();
        }
        Set<String> result = new TreeSet<String>();
        int idx;
        String remainingString = example;
        do {
            idx = remainingString.indexOf(' ');
            String token;
            if (idx >= 0) {
                token = remainingString.substring(0, idx);
                remainingString = remainingString.substring(idx + 1);
            } else {
                token = remainingString;
            }

            // Generate base forms for all POS to avoid POS tagging errors
            // (empty tokens are skipped).
            if (!token.isEmpty()) {
                result.add(token);
                for (POS pos : POS.values()) {
                    result.addAll(morphProcessor.lookupAllBaseForms(pos, token));
                }
            }
        } while (idx >= 0);
        return result;
    }

    protected void saveExampleMappings(final String example, final List<ExampleMapping> mappings,
            final int minScore, final boolean preferLongerLemmas) {
        // Select all senses that scored at least the minimal score.
        List<ExampleMapping> selection = new ArrayList<ExampleMapping>();
        for (ExampleMapping mapping : mappings) {
            if (mapping.getScore() >= minScore) {
                selection.add(mapping);
            }
        }

        // If there are ties, prefer the longer lemmas (i.e., drop any lemma
        // that is contained in another selected lemma).
        if (preferLongerLemmas) {
            List<ExampleMapping> temp = new ArrayList<ExampleMapping>();
            for (ExampleMapping mapping1 : selection) {
                String lemma1 = mapping1.getLemma();
                boolean select = true;
                for (ExampleMapping mapping2 : selection) {
                    String lemma2 = mapping2.getLemma();
                    if (lemma1.equals(lemma2)) {
                        continue;
                    }
                    if (lemma2.contains(lemma1)) {
                        select = false;
                        break;
                    }
                }
                if (select) {
                    temp.add(mapping1);
                }
            }
            selection = temp;
        }

        // Save the selected example mappings.
        for (ExampleMapping mapping : selection) {
            saveExampleMapping(example, mapping.getSenseKey());
        }
    }

    protected void saveExampleMapping(String example, String senseKey) {
        List<String> list = examples.get(senseKey);
        if (list == null) {
            list = new ArrayList<String>();
            examples.put(senseKey, list);
        }
        list.add(example);
    }

    /**
     * Returns the list of all UBY synsets generated by this generator.
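     * The list is populated by {@link #initialize()}; before initialization
     * it is empty.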
     */
    public List<Synset> getSynsets() {
        return synsets;
    }

    /**
     * Consumes a WordNet synset and returns its associated Uby-LMF synset,
     * generated by this generator.<br>
     * This method should only be called after the generator has been initialized.
     * @param wnSynset the WordNet synset for which the generated Uby-LMF synset
     *        should be returned
     * @return the Uby-LMF synset associated with the consumed wnSynset
     * @see Synset
     * @see net.sf.extjwnl.data.Synset
     * @see SynsetGenerator#initialize()
     */
    public Synset getLMFSynset(net.sf.extjwnl.data.Synset wnSynset) {
        return wnSynsetLMFSynsetMappings.get(wnSynset);
    }

    /**
     * Returns all mappings between WordNet's synsets and the corresponding
     * Uby-LMF synsets, with WordNet's synsets as keys.
     * @return the synset mappings created by this generator
     * @see Synset
     * @see net.sf.extjwnl.data.Synset
     */
    Map<net.sf.extjwnl.data.Synset, Synset> getWNSynsetLMFSynsetMappings() {
        return wnSynsetLMFSynsetMappings;
    }

    /**
     * Consumes a WordNet lexeme and returns the list of the lexeme's example
     * sentences, extracted by this generator from the lexeme's synset.
     * @param lexeme the WordNet lexeme whose example sentences should be returned
     * @return the lexeme's example sentences extracted by this generator
     * @see Word
     * @see net.sf.extjwnl.data.Synset
     */
    public List<String> getExamples(Word lexeme) {
        try {
            return examples.get(lexeme.getSenseKey());
        } catch (JWNLException e) {
            throw new IllegalArgumentException(e);
        }
    }

}