/**
 * Copyright 2016
 * Ubiquitous Knowledge Processing (UKP) Lab
 * Technische Universität Darmstadt
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
package de.tudarmstadt.ukp.lmf.transform.framenet;

import java.io.File;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.logging.Logger;

import org.apache.commons.lang.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import de.saar.coli.salsa.reiter.framenet.FrameNet;
import de.saar.coli.salsa.reiter.framenet.Lexeme;
import de.saar.coli.salsa.reiter.framenet.LexicalUnit;
import de.saar.coli.salsa.reiter.framenet.PartOfSpeech;
import de.saar.coli.salsa.reiter.framenet.SemanticType;
import de.saar.coli.salsa.reiter.framenet.SemanticTypeNotFoundException;
import de.saar.coli.salsa.reiter.framenet.fncorpus.AnnotatedLexicalUnit;
import de.saar.coli.salsa.reiter.framenet.fncorpus.AnnotationCorpus;
import de.saar.coli.salsa.reiter.framenet.fncorpus.AnnotationCorpus15;
import de.saar.coli.salsa.reiter.framenet.fncorpus.Sentence;
import de.tudarmstadt.ukp.lmf.model.core.Definition;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.Sense;
import de.tudarmstadt.ukp.lmf.model.core.TextRepresentation;
import de.tudarmstadt.ukp.lmf.model.enums.EExampleType;
import de.tudarmstadt.ukp.lmf.model.enums.ELabelTypeSemantics;
import de.tudarmstadt.ukp.lmf.model.enums.ELanguageIdentifier;
import de.tudarmstadt.ukp.lmf.model.enums.EPartOfSpeech;
import de.tudarmstadt.ukp.lmf.model.meta.Frequency;
import de.tudarmstadt.ukp.lmf.model.meta.SemanticLabel;
import de.tudarmstadt.ukp.lmf.model.morphology.Component;
import de.tudarmstadt.ukp.lmf.model.morphology.FormRepresentation;
import de.tudarmstadt.ukp.lmf.model.morphology.Lemma;
import de.tudarmstadt.ukp.lmf.model.morphology.ListOfComponents;
import de.tudarmstadt.ukp.lmf.model.semantics.MonolingualExternalRef;
import de.tudarmstadt.ukp.lmf.model.semantics.PredicativeRepresentation;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticArgument;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate;
import de.tudarmstadt.ukp.lmf.model.semantics.SenseExample;

/**
 * Creates {@link LexicalEntry LexicalEntries} out of FrameNet's data.
 * <p>
 * All work happens in the constructor: the {@link LexicalUnit LexicalUnits} of the supplied
 * {@link FrameNet} instance are grouped by lemma and part of speech, one LexicalEntry is created
 * per group, and finally the {@link Component Components} of multiword entries are linked to
 * their target LexicalEntries. The result is available via {@link #getLexicalEntries()}.
 * <p>
 * Unrecoverable inconsistencies (missing SemanticPredicate, unmapped part of speech, duplicate
 * group mapping) are logged and terminate the JVM via {@code System.exit(1)}.
 *
 * @author Zijad Maksuti, Silvana Hartmann
 */
public class LexicalEntryGenerator {

    public static final String LEXICAL_UNIT = "lexicalUnit";
    public static final String SEMANTIC_TYPE = "semanticType";

    private final String resourceVersion;

    private final SemanticPredicateGenerator semanticPredicateGenerator;

    private int lexicalEntryNumber; // Running number used for creating IDs of LexicalEntries

    private int senseNumber; // Running number used for creating IDs of Senses

    private int senseExampleNumber; // Running number used for creating IDs of SenseExamples

    private final FrameNet fn; // FrameNet object, used for obtaining needed information out of FrameNet's files

    // all LexicalEntries produced by this LexicalEntryGenerator
    private final List<LexicalEntry> lexicalEntries = new LinkedList<LexicalEntry>();

    /*
     * Groups of LexicalUnits with equal lemma,
     * divided by PartOfSpeech
     */
    private Map<PartOfSpeech, Map<String, Set<LexicalUnit>>> mappings;

    // Mapping between luGroups and corresponding LexicalEntries
    private final Map<Set<LexicalUnit>, LexicalEntry> groupLEMappings =
            new LinkedHashMap<Set<LexicalUnit>, LexicalEntry>();

    // Mappings used for creating targetLexicalEntry attribute in Component class (all Components)
    private final Map<PartOfSpeech, Map<String, List<Component>>> components =
            new LinkedHashMap<PartOfSpeech, Map<String, List<Component>>>();

    /*
     * This mapping contains all "dummy" LexicalEntries,
     * that are created in order to set the targetLexicalEntry attribute in Component class
     */
    private final Map<PartOfSpeech, Map<String, LexicalEntry>> dummyLEs =
            new LinkedHashMap<PartOfSpeech, Map<String, LexicalEntry>>();

    // used for extracting annotations; replaced by a fresh corpus for every LexicalUnit
    private AnnotationCorpus ac;

    // directory where FrameNet's files are located
    private final String fnhome;

    private final Log logger = LogFactory.getLog(getClass());

    // java.util.logging logger handed to every AnnotationCorpus15 instance
    private final Logger corpusLogger = Logger.getLogger(getClass().getName());

    /**
     * Constructs an instance of LexicalEntryGenerator and immediately generates all
     * {@link LexicalEntry LexicalEntries} out of FrameNet's files.
     * <p>
     * The annotation files are read from {@code $UBY_HOME/FrameNet/fndata-1.5}.
     *
     * @param fn
     *            instance of {@link FrameNet} class, used for obtaining needed information for
     *            generating LexicalEntries
     * @param semanticPredicateGenerator
     *            instance of {@link SemanticPredicateGenerator} used for creating
     *            {@link SemanticPredicate SemanticPredicates}
     * @param resourceVersion
     *            Version of the resource
     */
    public LexicalEntryGenerator(FrameNet fn, SemanticPredicateGenerator semanticPredicateGenerator,
            String resourceVersion) {
        this.fn = fn;
        this.semanticPredicateGenerator = semanticPredicateGenerator;
        this.resourceVersion = resourceVersion;

        String ubyHome = System.getenv("UBY_HOME");
        if (ubyHome == null) {
            // The path is still built exactly as before; the warning only makes the cause visible.
            logger.warn("Environment variable UBY_HOME is not set; "
                    + "FrameNet annotation files will not be found");
        }
        fnhome = ubyHome + "/FrameNet/fndata-1.5";

        logger.info("LUS group to");
        groupLUs();
        logger.info("LUS grouped");

        // Initialize help-mappings
        for (PartOfSpeech pos : PartOfSpeech.values()) {
            dummyLEs.put(pos, new TreeMap<String, LexicalEntry>());
            components.put(pos, new TreeMap<String, List<Component>>());
        }
        logger.info("help mappings initialized");

        createLexicalEntries();
        logger.info("LEs created");

        updateComponents();
        logger.info("Compontents updated");
    }

    /**
     * This method iterates over all groups of LexicalUnits and creates a {@link LexicalEntry} for
     * every group.
     */
    private void createLexicalEntries() {
        for (Map<String, Set<LexicalUnit>> groupsOfPos : mappings.values()) {
            for (Set<LexicalUnit> luGroup : groupsOfPos.values()) {
                // createLexicaltEntry registers the entry in groupLEMappings itself
                lexicalEntries.add(createLexicaltEntry(luGroup));
            }
        }
    }

    /**
     * This method creates a {@link LexicalEntry} based on the consumed group of
     * {@link LexicalUnit LexicalUnits}. One {@link Sense} is created per LexicalUnit.
     * <p>
     * The created entry is recorded as the entry of {@code luGroup}; if an entry was already
     * recorded for an equal group, the error is logged and the JVM is terminated.
     *
     * @param luGroup
     *            a group LexicalUnits with equal lemma and part of speech
     * @return generated LexicalEntry based on consumed luGroup
     */
    public LexicalEntry createLexicaltEntry(Set<LexicalUnit> luGroup) {
        LexicalEntry lexicalEntry = new LexicalEntry();
        lexicalEntry.setId(createID());

        PartOfSpeech pos = null;
        String lemmaString = null;
        ListOfComponents listOfComponents = null;
        List<Sense> senses = new ArrayList<Sense>();

        for (LexicalUnit lu : luGroup) {
            if (pos == null) {
                pos = lu.getPartOfSpeech();
            }
            lemmaString = buildLemmaString(lu);

            // Creating a list of components for multiword LexicalUnits. This does not depend on
            // annotations being available, and one list per entry is enough: groupLUs() only
            // groups LexicalUnits whose lexemes agree in POS, break, head and order.
            List<Lexeme> lexemes = lu.getLexemes();
            if (listOfComponents == null && lexemes.size() > 1) {
                listOfComponents = createListOfComponents(lexemes);
            }

            senses.add(createSense(lu, senses.size() + 1));
        }
        lexicalEntry.setSenses(senses);

        // Setting POS
        EPartOfSpeech epos = FNUtils.getPOS(pos);
        if (epos == null) {
            posNotFound(pos);
        }
        lexicalEntry.setPartOfSpeech(epos);

        // Creating a lemma
        FormRepresentation formRepresentation = new FormRepresentation();
        formRepresentation.setLanguageIdentifier(ELanguageIdentifier.ENGLISH);
        formRepresentation.setWrittenForm(lemmaString);
        List<FormRepresentation> formRepresentations = new LinkedList<FormRepresentation>();
        formRepresentations.add(formRepresentation);
        Lemma lemma = new Lemma();
        lemma.setFormRepresentations(formRepresentations);
        lexicalEntry.setLemma(lemma);

        LexicalEntry control = groupLEMappings.put(luGroup, lexicalEntry);
        if (control != null) {
            logger.error("LexicalEntryGenerator: Mapping for luGroup: " + luGroup
                    + " already exists!" + "\n" + "Aborting all operations!");
            System.exit(1);
        }

        // Setting listOfComponents
        if (listOfComponents != null) {
            lexicalEntry.setListOfComponents(listOfComponents);
        }
        return lexicalEntry;
    }

    /**
     * Builds the lemma of a LexicalUnit. For multiword units the lexeme values are joined with a
     * blank in the sequence given by {@link Lexeme#getOrder()}, because
     * {@link LexicalUnit#getLexemeString()} returns the wrong order for some multiword
     * expressions. If a lexeme reports an order outside {@code 1..n}, the API string is used.
     *
     * @param lu
     *            the LexicalUnit whose lemma is needed
     * @return the lemma string of lu
     */
    private String buildLemmaString(LexicalUnit lu) {
        List<Lexeme> lexemeList = lu.getLexemes();
        int lexemeCount = lexemeList.size();
        if (lexemeCount <= 1) {
            // unigram lemma
            return lu.getLexemeString();
        }
        String[] ordered = new String[lexemeCount];
        for (Lexeme lex : lexemeList) {
            int slot = lex.getOrder() - 1;
            if (slot < 0 || slot >= lexemeCount) {
                logger.warn("Lexeme order " + lex.getOrder() + " out of range for LexicalUnit "
                        + lu.getId() + "; using the unordered lexeme string");
                return lu.getLexemeString();
            }
            ordered[slot] = lex.getValue();
        }
        return StringUtils.join(ordered, " ");
    }

    /**
     * Creates the {@link Sense} of a single LexicalUnit: definition, external reference to the
     * LexicalUnit, predicative representation, and (when an annotation of the unit can be
     * loaded) incorporated argument, semantic labels and sense examples, followed by frequencies.
     *
     * @param lu
     *            the LexicalUnit to convert
     * @param index
     *            the 1-based index of the sense within its LexicalEntry
     * @return the created Sense
     */
    private Sense createSense(LexicalUnit lu, int index) {
        Sense sense = new Sense();
        sense.setId("FN_Sense_" + (senseNumber++));
        sense.setIndex(index);

        // setting Definition
        TextRepresentation textRepresentation = new TextRepresentation();
        textRepresentation.setLanguageIdentifier(ELanguageIdentifier.ENGLISH);
        textRepresentation.setWrittenText(FNUtils.filterTags(lu.getDefinition()));
        List<TextRepresentation> textRepresentations = new ArrayList<TextRepresentation>();
        textRepresentations.add(textRepresentation);
        Definition definition = new Definition();
        definition.setTextRepresentations(textRepresentations);
        List<Definition> definitions = new ArrayList<Definition>();
        definitions.add(definition);
        sense.setDefinitions(definitions);

        // setting MonolingualExternalRef
        MonolingualExternalRef monolingualExternalRef = new MonolingualExternalRef();
        monolingualExternalRef.setExternalSystem(resourceVersion + "_" + LEXICAL_UNIT);
        monolingualExternalRef.setExternalReference(lu.getId());
        List<MonolingualExternalRef> monolingualExternalRefs = new ArrayList<MonolingualExternalRef>();
        monolingualExternalRefs.add(monolingualExternalRef);
        sense.setMonolingualExternalRefs(monolingualExternalRefs);

        // setting PredicativeRepresentation
        SemanticPredicate semanticPredicate = semanticPredicateGenerator.getSemanticPredicate(lu.getFrame());
        if (semanticPredicate == null) {
            logger.error("LexicalEntryGenerator: SemanticPredicateGenerator did not provide SemanticPredicate of Frame: "
                    + lu.getFrame() + '\n' + "Aborting all operations!");
            System.exit(1);
        }
        PredicativeRepresentation predicativeRepresentation = new PredicativeRepresentation();
        predicativeRepresentation.setPredicate(semanticPredicate);
        List<PredicativeRepresentation> predicativeRepresentations = new ArrayList<PredicativeRepresentation>();
        predicativeRepresentations.add(predicativeRepresentation);
        sense.setPredicativeRepresentations(predicativeRepresentations);

        // Annotation-dependent information can only be set for annotated lus
        AnnotatedLexicalUnit alu = loadAnnotation(lu);
        if (alu != null) {
            String incorporatedFEName = alu.getIncorporatedFE();
            if (incorporatedFEName != null) {
                markIncorporatedArgument(sense, semanticPredicate, incorporatedFEName);
            }
            addSemanticLabels(sense, alu.getSemTypes());
            addSenseExamples(sense, alu);
        }

        addFrequencies(sense, lu);
        return sense;
    }

    /**
     * Parses the annotation file {@code lu<ID>.xml} of the consumed LexicalUnit into a fresh
     * corpus and returns the unit's annotation.
     *
     * @param lu
     *            the LexicalUnit whose annotation should be loaded
     * @return the annotation of lu as returned by the corpus, possibly {@code null}
     */
    private AnnotatedLexicalUnit loadAnnotation(LexicalUnit lu) {
        ac = new AnnotationCorpus15(fn, corpusLogger);
        ac.parse(new File(fnhome + File.separator + "lu"), "lu" + lu.getId() + ".xml");
        return ac.getAnnotation(lu);
    }

    /**
     * Records the incorporated frame element on the predicate and the sense. If the predicate has
     * no arguments, a new incorporated argument is created; otherwise every argument whose
     * semantic role equals the frame element name is flagged as incorporated.
     *
     * @param sense
     *            the Sense that receives the incorporated argument, if one was found or created
     * @param semanticPredicate
     *            the predicate whose arguments are inspected and written back
     * @param incorporatedFEName
     *            name of the incorporated frame element, not {@code null}
     */
    private void markIncorporatedArgument(Sense sense, SemanticPredicate semanticPredicate,
            String incorporatedFEName) {
        List<SemanticArgument> semanticArguments = semanticPredicate.getSemanticArguments();
        if (semanticArguments == null) {
            semanticArguments = new ArrayList<SemanticArgument>();
        }

        SemanticArgument incorporatedArgument = null;
        if (semanticArguments.isEmpty()) {
            incorporatedArgument =
                    semanticPredicateGenerator.createIncorporatedSemanticArgument(incorporatedFEName);
            semanticArguments.add(incorporatedArgument);
        } else {
            // change the isIncorporated flag of the corresponding argument
            for (SemanticArgument semanticArgument : semanticArguments) {
                // comparison starts from the non-null name, so an argument without role is skipped
                if (incorporatedFEName.equals(semanticArgument.getSemanticRole())) {
                    incorporatedArgument = semanticArgument;
                    incorporatedArgument.setIncorporated(true);
                }
            }
        }
        semanticPredicate.setSemanticArguments(semanticArguments);

        if (incorporatedArgument != null) {
            sense.setIncorporatedSemArg(incorporatedArgument);
        }
    }

    /**
     * Maps the semantic types of an annotated LexicalUnit to the Sense. Purely numeric entries
     * (IDs) are only logged, "Transparent Noun" sets the transparent-meaning flag, and every
     * other type becomes a {@link SemanticLabel} with an external reference to the semantic type.
     *
     * @param sense
     *            the Sense to update
     * @param semTypes
     *            the semantic types of the annotated LexicalUnit
     */
    private void addSemanticLabels(Sense sense, Set<String> semTypes) {
        for (String s : semTypes) {
            if (s.matches("^[0-9]+")) { // ID
                logger.info("ID: " + s);
                continue;
            }
            // Numeric IDs never reach this point, so the former additional comparisons against
            // "9" and "223" could not match and have been dropped.
            if (s.equalsIgnoreCase("Transparent Noun")) {
                // no semantic label
                sense.setTransparentMeaning(true);
                continue;
            }

            // semantic label format
            sense.setTransparentMeaning(false);
            SemanticLabel semanticLabel = new SemanticLabel();
            semanticLabel.setType(labelTypeOf(s));
            semanticLabel.setLabel(resolveSemanticTypeName(s));

            // creating MonolingualExternalRef for SemanticLabel
            MonolingualExternalRef meref = new MonolingualExternalRef();
            meref.setExternalReference(s);
            meref.setExternalSystem(resourceVersion + "_" + SEMANTIC_TYPE);
            List<MonolingualExternalRef> merefs = new LinkedList<MonolingualExternalRef>();
            merefs.add(meref);
            // The label must carry its own semantic-type reference, not the sense's
            // lexical-unit reference.
            semanticLabel.setMonolingualExternalRefs(merefs);

            List<SemanticLabel> semanticLabels = sense.getSemanticLabels();
            if (semanticLabels == null) {
                semanticLabels = new ArrayList<SemanticLabel>();
            }
            semanticLabels.add(semanticLabel);
            sense.setSemanticLabels(semanticLabels);
        }
    }

    /**
     * Chooses the label type for a (non-numeric, non-transparent) semantic type name.
     *
     * @param semTypeName
     *            name of the semantic type
     * @return sentiment for the judgment types, collocate for bound/support types,
     *         resourceSpecific for Biframal_LU and Tendency_Grading_LU, semanticCategory otherwise
     */
    private ELabelTypeSemantics labelTypeOf(String semTypeName) {
        if (semTypeName.equalsIgnoreCase("Negative_judgment")
                || semTypeName.equalsIgnoreCase("Positive_judgment")) {
            return ELabelTypeSemantics.sentiment;
        }
        if (semTypeName.equalsIgnoreCase("Bound_LU")
                || semTypeName.equalsIgnoreCase("Bound_dependent_LU")
                || semTypeName.equalsIgnoreCase("Support")) {
            return ELabelTypeSemantics.collocate;
        }
        if (semTypeName.equalsIgnoreCase("Biframal_LU")
                || semTypeName.equalsIgnoreCase("Tendency_Grading_LU")) {
            return ELabelTypeSemantics.resourceSpecific;
        }
        // this should be ontological types
        return ELabelTypeSemantics.semanticCategory;
    }

    /**
     * Looks up the semantic type in FrameNet and returns its name. If FrameNet does not know the
     * type, a warning is logged and the consumed name itself is returned, so that the label can
     * still be created.
     *
     * @param semTypeName
     *            name of the semantic type as found in the annotation
     * @return the name reported by FrameNet, or semTypeName if the type could not be resolved
     */
    private String resolveSemanticTypeName(String semTypeName) {
        try {
            SemanticType semanticType = fn.getSemanticType(semTypeName);
            if (semanticType != null) {
                return semanticType.getName();
            }
        } catch (SemanticTypeNotFoundException e) {
            logger.warn("Did not find semantic type in FN: " + semTypeName);
        }
        return semTypeName;
    }

    /**
     * Creates a {@link SenseExample} for every annotated sentence of the LexicalUnit and adds it
     * to the Sense.
     *
     * @param sense
     *            the Sense that receives the examples
     * @param alu
     *            the annotation providing the sentences
     */
    private void addSenseExamples(Sense sense, AnnotatedLexicalUnit alu) {
        List<SenseExample> senseExamples = sense.getSenseExamples();
        if (senseExamples == null) {
            senseExamples = new ArrayList<SenseExample>();
        }
        sense.setSenseExamples(senseExamples);

        for (Sentence sentence : alu.getSentences()) {
            SenseExample senseExample = new SenseExample();
            senseExample.setId("FN_SenseExample_" + (senseExampleNumber++));

            TextRepresentation exampleText = new TextRepresentation();
            exampleText.setLanguageIdentifier(ELanguageIdentifier.ENGLISH);
            exampleText.setWrittenText(FNUtils.filterTags(sentence.getText()));
            List<TextRepresentation> exampleTexts = new ArrayList<TextRepresentation>();
            exampleTexts.add(exampleText);
            senseExample.setTextRepresentations(exampleTexts);

            senseExample.setExampleType(EExampleType.senseInstance);
            senseExamples.add(senseExample);
        }
    }

    /**
     * Adds two {@link Frequency} records to the Sense: the number of annotated sentences
     * (generator "annotated_instances") and the total number of sentences (generator
     * "all_instances") of the LexicalUnit.
     *
     * @param sense
     *            the Sense that receives the frequencies
     * @param lu
     *            the LexicalUnit providing the counts
     */
    private void addFrequencies(Sense sense, LexicalUnit lu) {
        List<Frequency> frequencies = sense.getFrequencies();
        if (frequencies == null) {
            frequencies = new ArrayList<Frequency>();
        }

        // for annotated instances
        Frequency annotated = new Frequency();
        annotated.setFrequency(lu.getSentCountAnnotated());
        annotated.setGenerator("annotated_instances");
        frequencies.add(annotated);

        // total
        Frequency total = new Frequency();
        total.setFrequency(lu.getSentCountTotal());
        total.setGenerator("all_instances");
        frequencies.add(total);

        sense.setFrequencies(frequencies);
    }

    /**
     * This method consumes a list of {@link Lexeme Lexemes} and creates a {@link ListOfComponents}.
     * The {@link Component Components} do NOT have the targetLexicalEntry attribute set! Every
     * created Component is recorded by part of speech and lexeme value, so that
     * {@link #updateComponents()} can set the attribute later.
     *
     * @param lexemes
     *            the list of Lexemes from which a ListOfComponents should be generated
     * @return ListOfComponents based on consumed lexemes
     */
    private ListOfComponents createListOfComponents(List<Lexeme> lexemes) {
        List<Component> created = new LinkedList<Component>();
        for (Lexeme lexeme : lexemes) {
            // Create a Component for every lexeme
            Component component = new Component();
            component.setHead(lexeme.isHeadword());
            component.setBreakBefore(lexeme.isBreakBefore());
            component.setPosition(lexeme.getOrder());
            created.add(component);

            // Record for creation of targetLexicalEntry attribute later
            Map<String, List<Component>> componentsOfPos = this.components.get(lexeme.getPartOfSpeech());
            String name = lexeme.getValue();
            List<Component> sameName = componentsOfPos.get(name);
            if (sameName == null) {
                sameName = new LinkedList<Component>();
                componentsOfPos.put(name, sameName);
            }
            sameName.add(component);
        }
        ListOfComponents listOfComponents = new ListOfComponents();
        listOfComponents.setComponents(created);
        return listOfComponents;
    }

    /**
     * This method creates an ID for a {@link LexicalEntry}
     *
     * @return ID for an instance of LexicalEntry
     */
    private String createID() {
        return "FN_LexicalEntry_" + (lexicalEntryNumber++);
    }

    /**
     * This method iterates over all created {@link Component Components} and updates their
     * {@link Component#getTargetLexicalEntry() targetLexicalEntry} attribute. The target is the
     * LexicalEntry of the LexicalUnit group stored under the component's part of speech and
     * lexeme value; if no such group exists, a "dummy" LexicalEntry is created once and shared.
     */
    private void updateComponents() {
        for (Map.Entry<PartOfSpeech, Map<String, List<Component>>> posEntry : components.entrySet()) {
            PartOfSpeech pos = posEntry.getKey();
            for (Map.Entry<String, List<Component>> lemmaEntry : posEntry.getValue().entrySet()) {
                List<Component> sameLemma = lemmaEntry.getValue();
                if (sameLemma.isEmpty()) {
                    continue; // nothing to link, so no dummy entry must be created either
                }
                String lemma = lemmaEntry.getKey();

                // The target only depends on (pos, lemma), so it is resolved once per list.
                LexicalEntry target = findTargetLexicalEntry(pos, lemma);
                if (target == null) {
                    target = createDummyLexicalEntry(pos, lemma);
                }
                for (Component component : sameLemma) {
                    component.setTargetLexicalEntry(target);
                }
            }
        }
    }

    /**
     * Looks up the already existing LexicalEntry for a component.
     *
     * @param pos
     *            part of speech of the component's lexeme
     * @param lemma
     *            value of the component's lexeme
     * @return the LexicalEntry of the matching LexicalUnit group, else a previously created dummy
     *         LexicalEntry, else {@code null}
     */
    private LexicalEntry findTargetLexicalEntry(PartOfSpeech pos, String lemma) {
        Set<LexicalUnit> luGroup = mappings.get(pos).get(lemma);
        if (luGroup == null) {
            // when no luGroup with this lemma has been found
            // check if a dummy LexicalEntry exists
            return dummyLEs.get(pos).get(lemma);
        }
        LexicalEntry lexicalEntry = groupLEMappings.get(luGroup);
        if (lexicalEntry == null) {
            logger.error("LexicalEntryGenerator: Error on updating Components!"
                    + "No lexical entry for luGroup: " + luGroup + " found"
                    + '\n' + "Aborting all operations!");
            System.exit(1);
        }
        return lexicalEntry;
    }

    /**
     * Creates a "dummy" LexicalEntry (ID, part of speech and lemma only) for a component without
     * a corresponding LexicalEntry, and records it in the dummy mapping and in the list of all
     * generated LexicalEntries.
     *
     * @param pos
     *            part of speech of the new entry
     * @param lemma
     *            written form of the new entry's lemma
     * @return the created LexicalEntry
     */
    private LexicalEntry createDummyLexicalEntry(PartOfSpeech pos, String lemma) {
        LexicalEntry lexicalEntry = new LexicalEntry();
        EPartOfSpeech epos = FNUtils.getPOS(pos);
        if (epos == null) {
            posNotFound(pos);
        }
        lexicalEntry.setPartOfSpeech(epos);
        lexicalEntry.setId(createID());

        FormRepresentation formRepresentation = new FormRepresentation();
        formRepresentation.setLanguageIdentifier(ELanguageIdentifier.ENGLISH);
        formRepresentation.setWrittenForm(lemma);
        List<FormRepresentation> formRepresentations = new ArrayList<FormRepresentation>();
        formRepresentations.add(formRepresentation);

        Lemma lemmaObj = new Lemma();
        lemmaObj.setLexicalEntry(lexicalEntry);
        lemmaObj.setFormRepresentations(formRepresentations);
        lexicalEntry.setLemma(lemmaObj);

        // Add a record for future
        dummyLEs.get(pos).put(lemma, lexicalEntry);
        lexicalEntries.add(lexicalEntry);
        return lexicalEntry;
    }

    /**
     * This method groups all LexicalUnits by lemma and part of speech. For multiword
     * LexicalUnits the grouping key additionally contains POS, isBreakBefore, isHeadWord and
     * order of every lexeme.
     */
    private void groupLUs() {
        mappings = new LinkedHashMap<PartOfSpeech, Map<String, Set<LexicalUnit>>>();
        for (PartOfSpeech pos : PartOfSpeech.values()) {
            mappings.put(pos, new TreeMap<String, Set<LexicalUnit>>());
        }

        for (LexicalUnit lu : fn.getLexicalUnits()) {
            StringBuilder key = new StringBuilder(lu.getLexemeString()); // lu's lemma

            // Appending partOfSpeech of components for multiword expressions
            List<Lexeme> lexemes = lu.getLexemes();
            if (lexemes.size() > 1) {
                for (Lexeme lexeme : lexemes) {
                    /*
                     * POS, isBreakBefore, isHeadWord and Order are relevant when grouping
                     * multiword LexicalUnits
                     */
                    key.append(lexeme.getPos());
                    key.append("isBreakBefore:").append(Boolean.toString(lexeme.isBreakBefore()));
                    key.append("isHeadWord:").append(Boolean.toString(lexeme.isHeadword()));
                    key.append("order:").append(Integer.toString(lexeme.getOrder()));
                }
            }

            Map<String, Set<LexicalUnit>> lemmaLUMappings = mappings.get(lu.getPartOfSpeech());
            String lemma = key.toString();
            Set<LexicalUnit> luGroup = lemmaLUMappings.get(lemma);
            if (luGroup == null) {
                luGroup = new LinkedHashSet<LexicalUnit>();
                lemmaLUMappings.put(lemma, luGroup);
            }
            luGroup.add(lu);
        }
    }

    /**
     * This method is called when an associated part of speech, defined in {@link EPartOfSpeech},
     * could not be found for part of speech defined in {@link PartOfSpeech}.
     * It informs the user about the situation and terminates the running process
     *
     * @param pos
     *            part of speech defined in {@link PartOfSpeech}, for which an associated part of
     *            speech in {@link EPartOfSpeech} could not be found
     */
    private void posNotFound(PartOfSpeech pos) {
        logger.error("LexicalEntryGenerator: FNUtils returned null for PartOfSpeech: " + pos
                + " Aborting all operations.");
        System.exit(1);
    }

    /**
     * Returns all {@link LexicalEntry LexicalEntries} generated by this
     * {@link LexicalEntryGenerator}, including the "dummy" entries created for components.
     *
     * @return the lexicalEntries
     */
    public List<LexicalEntry> getLexicalEntries() {
        return lexicalEntries;
    }
}