/** * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.tudarmstadt.ukp.lmf.transform.germanet; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import de.tudarmstadt.ukp.lmf.model.enums.ECase; import de.tudarmstadt.ukp.lmf.model.enums.EComplementizer; import de.tudarmstadt.ukp.lmf.model.enums.EDeterminer; import de.tudarmstadt.ukp.lmf.model.enums.EGrammaticalFunction; import de.tudarmstadt.ukp.lmf.model.enums.EGrammaticalNumber; import de.tudarmstadt.ukp.lmf.model.enums.ESyntacticCategory; import de.tudarmstadt.ukp.lmf.model.enums.ESyntacticProperty; import de.tudarmstadt.ukp.lmf.model.enums.ETense; import de.tudarmstadt.ukp.lmf.model.enums.EVerbForm; import de.tudarmstadt.ukp.lmf.model.semantics.SemanticArgument; import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate; import de.tudarmstadt.ukp.lmf.model.semantics.SynSemArgMap; import de.tudarmstadt.ukp.lmf.model.semantics.SynSemCorrespondence; import de.tudarmstadt.ukp.lmf.model.syntax.LexemeProperty; import 
de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame;
import de.tudarmstadt.ukp.lmf.model.syntax.SyntacticArgument;

/**
 * This class extracts subcategorization frames of GermaNet 7.0
 * using a custom subcategorization mapping file.
 * <p>
 * The mapping file is read completely when the extractor is constructed. Lines starting with
 * {@code #} are skipped; every other line has the form {@literal <CODE>%<Arg>:..:<Arg>}, where
 * each {@literal <Arg>} is a comma-separated list of {@code name=value} attributes.
 * <p>
 * NOTE(review): the code-to-argument mappings are kept in a {@link HashMap} and frames are
 * created while iterating over it, so the running numbers used in the generated IDs (and which
 * code becomes the {@code subcatLabel} of a shared frame) follow the map's iteration order
 * rather than the file order - confirm whether reproducible IDs are required.
 */
public class SubcategorizationFrameExtractor {

    private final InputStream subcatStream; // The stream containing subcatMappings

    private int subcatFrameNumber = 0; // Running number for creating IDs of SubcategorizationFrames
    private int syntacticArgumentNumber = 0; // Running number for creating IDs of SyntacticArguments

    // frame code -> SubcategorizationFrame
    private final Map<String, SubcategorizationFrame> codeFrameMappings = new HashMap<String, SubcategorizationFrame>();
    // frame code -> SemanticPredicate (only for codes whose arguments specify a semantic role)
    private final Map<String, SemanticPredicate> codePredMappings = new HashMap<String, SemanticPredicate>();
    // frame code -> raw argument string (the part of the line after '%')
    private final Map<String, String> codeSynSemArgMapping = new HashMap<String, String>();
    // raw argument string -> argument string with the first semanticRole attribute removed
    private final Map<String, String> synSemArgSynArgMapping = new HashMap<String, String>();
    // stripped argument string -> SubcategorizationFrame; lets several codes share one frame
    private final Map<String, SubcategorizationFrame> synArgSubcatFrameMapping = new HashMap<String, SubcategorizationFrame>();

    private final List<SemanticPredicate> semanticPredicates = new LinkedList<SemanticPredicate>(); // List of all SemanticPredicates
    private int semanticPredicateNumber = 0; // Running number for creating IDs of SemanticPredicates
    private int semanticArgumentNumber = 0; // Running number for creating IDs of SemanticArguments

    private final List<SynSemCorrespondence> synSemCorrespondences = new LinkedList<SynSemCorrespondence>();
    private int synSemCorrespondenceNumber = 0; // Running number for creating IDs

    // Mapping between LMF-Code of arguments (syntactic + semantic roles) and SynSemCorrespondence
    private final Map<String, SynSemCorrespondence> synsemargsSynSemCorrMap = new HashMap<String, SynSemCorrespondence>();

    private final Log logger = LogFactory.getLog(getClass());

    /**
     * Constructs a {@link SubcategorizationFrameExtractor} and immediately parses the mappings.
     * The stream is read to its end and closed by this constructor.
     *
     * @param subcatStream stream of the File containing the SubcategorizationFrame-mappings
     * @throws RuntimeException if reading the stream fails
     * @throws IllegalArgumentException if a non-blank, non-comment line contains no
     *         {@code %}-separated argument part
     */
    public SubcategorizationFrameExtractor(InputStream subcatStream){
        this.subcatStream = subcatStream;
        parseSubcatMappings();
    }

    /**
     * This method consumes a frame
     * and returns its corresponding instance of {@link SubcategorizationFrame} class
     *
     * @param frame a frame for which subcategorization frame should be returned
     * @return subcategorization frame associated with the consumed frame,
     *         or {@code null} if the frame code is not contained in the mappings
     */
    public SubcategorizationFrame getSubcategorizationFrame(String frame){
        return codeFrameMappings.get(frame);
    }

    /**
     * This method consumes a frame string-representation
     * and returns its corresponding instance of {@link SemanticPredicate} class
     *
     * @param frame a frame for which semantic predicate should be returned
     * @return semantic predicate associated with the consumed frame, or {@code null} if no
     *         predicate was created for this frame code
     */
    public SemanticPredicate getSemanticPredicate(String frame){
        return codePredMappings.get(frame);
    }

    /**
     * Returns a sorted list of all subcategorization frames contained in this extractor.
     * The list is a fresh copy, sorted by the natural ordering of {@link SubcategorizationFrame}.
     *
     * @return all subcategorization frames in this extractor
     * @see SubcategorizationFrame
     * @see SubcategorizationFrameExtractor
     */
    public List<SubcategorizationFrame> getSubcategorizationFrames(){
        List<SubcategorizationFrame> result =
                new LinkedList<SubcategorizationFrame>(synArgSubcatFrameMapping.values());
        Collections.sort(result);
        return result;
    }

    /**
     * This method returns all semantic predicates generated by this extractor.
     * The returned list is the extractor's internal list, not a copy.
     *
     * @return a list of all semantic predicates generated by this extractor
     * @see SemanticPredicate
     * @see SubcategorizationFrameExtractor
     */
    public List<SemanticPredicate> getSemanticPredicates(){
        return semanticPredicates;
    }

    /**
     * This method returns all instances of {@link SynSemCorrespondence} class, generated by this extractor.
     * The returned list is the extractor's internal list, not a copy.
     *
     * @return syntactic-semantic correspondences generated by this extractor.
     */
    public List<SynSemCorrespondence> getSynSemCorrespondences() {
        return synSemCorrespondences;
    }

    /**
     * This method parses the file containing the SubcategorizationFrame-mappings
     * and then creates the subcategorization frames, semantic predicates and
     * syntactic-semantic correspondences described by it.
     *
     * @throws RuntimeException if reading the mapping stream fails
     */
    private void parseSubcatMappings(){
        logger.info("Parsing subcategorization mappings...");
        try {
            // NOTE(review): no charset is given, so the platform default is used for decoding -
            // confirm the encoding of the mapping file before pinning one.
            BufferedReader input = new BufferedReader(new InputStreamReader(subcatStream));
            try {
                String line;
                while ((line = input.readLine()) != null) {
                    if (!line.startsWith("#")) {
                        parseLine(line);
                    }
                }
            } finally {
                // close the reader even when reading or parsing a line fails
                input.close();
            }
        } catch (IOException e) {
            throw new RuntimeException("Error on reading subcategorization mapping file", e);
        }
        logger.info("Parsing subcategorization mappings done");

        // create SubcategorizationFrames
        for (Map.Entry<String, String> entry : codeSynSemArgMapping.entrySet()) {
            String code = entry.getKey();
            String synSemArgs = entry.getValue();
            String synArgs = synSemArgSynArgMapping.get(synSemArgs);

            // codes with identical (stripped) argument strings share one frame
            SubcategorizationFrame subcategorizationFrame = synArgSubcatFrameMapping.get(synArgs);
            if (subcategorizationFrame == null) {
                subcategorizationFrame = new SubcategorizationFrame();
                subcategorizationFrame.setId("GN_SubcategorizationFrame_".concat(Integer.toString(subcatFrameNumber)));
                subcategorizationFrame.setSubcatLabel(code);
                subcatFrameNumber++;
                subcategorizationFrame = parseArguments(synSemArgs, subcategorizationFrame);
                synArgSubcatFrameMapping.put(synArgs, subcategorizationFrame);
            }
            codeFrameMappings.put(code, subcategorizationFrame);

            if (synSemArgs.contains("semanticRole")) { // only few GN-frames specify a semantic role
                SemanticPredicate semanticPredicate = parseSemanticArguments(synSemArgs, subcategorizationFrame);
                codePredMappings.put(code, semanticPredicate);
                semanticPredicates.add(semanticPredicate);
            }
        }
    }

    /**
     * This method parses a line of SubcatMappings-file.
     * Line of SubcatMappings-file has the form: {@literal <CODE>%<Arg>:..:<Arg>}.
     * Blank lines are ignored.
     *
     * @param line line of SubcatMappings-file
     * @throws IllegalArgumentException if a non-blank line has no argument part after {@code %}
     */
    private void parseLine(String line) {
        if (line.trim().isEmpty()) {
            // tolerate empty lines (e.g. a trailing newline at the end of the file)
            return;
        }
        String[] parts = line.split("%");
        if (parts.length < 2) {
            throw new IllegalArgumentException(
                    "Malformed subcategorization mapping line, expected <CODE>%<Arg>:..:<Arg> but was: " + line);
        }
        String code = parts[0];
        String synSemArgs = parts[1];
        codeSynSemArgMapping.put(code, synSemArgs);

        if (synSemArgs.contains("semanticRole")) {
            // NOTE(review): only the first ",semanticRole=<lowercase letters>" attribute is removed;
            // an argument string with several semantic roles keeps the later ones in its key -
            // confirm whether that is intended.
            String synArgs = synSemArgs.replaceFirst(",semanticRole=[a-z]+", "");
            synSemArgSynArgMapping.put(synSemArgs, synArgs);
        } else {
            synSemArgSynArgMapping.put(synSemArgs, synSemArgs);
        }
    }

    /**
     * This method parses syntactic arguments encoded in a line of subcategorization mapping file.
     * Every {@code :}-separated part becomes one {@link SyntacticArgument}, except parts containing
     * {@code syntacticProperty}, which set the {@link LexemeProperty} of the frame instead.
     * Attributes with a name not handled here (e.g. {@code semanticRole}) are ignored.
     *
     * @param synSemArgs part of the line encoding the arguments
     * @param subcatFrame subcategorization frame to which syntactic arguments should be appended
     * @return subcategorization frame with appended syntactic arguments (the same instance as {@code subcatFrame})
     * @see SubcategorizationFrame
     * @see SyntacticArgument
     */
    private SubcategorizationFrame parseArguments(String synSemArgs, SubcategorizationFrame subcatFrame) {
        List<SyntacticArgument> synArgs = new LinkedList<SyntacticArgument>();
        String[] args = synSemArgs.split(":");

        for (String arg : args) {
            if (!arg.contains("syntacticProperty")) {
                SyntacticArgument syntacticArgument = new SyntacticArgument();
                syntacticArgument.setId("GN_SyntacticArgument_".concat(Integer.toString(syntacticArgumentNumber)));
                syntacticArgumentNumber++;

                String[] atts = arg.split(",");
                for (String att : atts) {
                    String[] splits = att.split("=");
                    String attName = splits[0];
                    if (attName.equals("grammaticalFunction")) {
                        // needs some extra care because of incomplete names in the mappings-file:
                        // values ending in "Comp" are completed to "...Complement"
                        String gf = splits[1];
                        if (gf.endsWith("Comp")) {
                            gf = gf.concat("lement");
                        }
                        syntacticArgument.setGrammaticalFunction(EGrammaticalFunction.valueOf(gf));
                    } else if (attName.equals("syntacticCategory")) {
                        syntacticArgument.setSyntacticCategory(ESyntacticCategory.valueOf(splits[1]));
                    } else if (attName.equals("optional")) {
                        syntacticArgument.setOptional(splits[1].equals("yes"));
                    } else if (attName.equals("case")) {
                        syntacticArgument.setCase(ECase.valueOf(splits[1]));
                    } else if (attName.equals("determiner")) {
                        syntacticArgument.setDeterminer(EDeterminer.valueOf(splits[1]));
                    } else if (attName.equals("preposition")) {
                        syntacticArgument.setPreposition(splits[1]);
                    } else if (attName.equals("prepositionType")) {
                        syntacticArgument.setPrepositionType(splits[1]);
                    } else if (attName.equals("number")) {
                        syntacticArgument.setNumber(EGrammaticalNumber.valueOf(splits[1]));
                    } else if (attName.equals("lex")) {
                        syntacticArgument.setLexeme(splits[1]);
                    } else if (attName.equals("verbForm")) {
                        syntacticArgument.setVerbForm(EVerbForm.valueOf(splits[1]));
                    } else if (attName.equals("tense")) {
                        syntacticArgument.setTense(ETense.valueOf(splits[1]));
                    } else if (attName.equals("complementizer")) {
                        syntacticArgument.setComplementizer(EComplementizer.valueOf(splits[1]));
                    }
                }
                synArgs.add(syntacticArgument);
            } else {
                String[] splits = arg.split("=");
                String sp = splits[1];
                // the mapping file writes "raising" where the enum constant is "subjectRaising"
                if (sp.equals("raising")) {
                    sp = "subjectRaising";
                }
                LexemeProperty lexemeProperty = new LexemeProperty();
                lexemeProperty.setSyntacticProperty(ESyntacticProperty.valueOf(sp));
                subcatFrame.setLexemeProperty(lexemeProperty);
            }
        }
        subcatFrame.setSyntacticArguments(synArgs);
        return subcatFrame;
    }

    /**
     * This method consumes the part of the line of subcategorization mapping file encoding semantic arguments. <br>
     * It parses the arguments and returns an instance of {@link SemanticPredicate} class containing the arguments.
     * As a side effect a {@link SynSemCorrespondence} holding one {@link SynSemArgMap} per semantic role
     * is created, appended to the list of correspondences and registered under {@code synSemArgs}.
     *
     * @param synSemArgs part of the line encoding semantic arguments.
     * @param subcategorizationFrame instance of {@link SubcategorizationFrame} class used for creating
     * instances of {@link SynSemArgMap} class
     * @return semantic predicate containing parsed semantic arguments
     */
    private SemanticPredicate parseSemanticArguments(String synSemArgs, SubcategorizationFrame subcategorizationFrame) {
        SemanticPredicate semanticPredicate = new SemanticPredicate();
        semanticPredicate.setId("GN_SemanticPredicate_".concat(Integer.toString(semanticPredicateNumber)));
        semanticPredicateNumber++;

        List<SemanticArgument> semanticArguments = new LinkedList<SemanticArgument>();
        // list of mappings between Syntactic and Semantic Arguments is to be created
        List<SynSemArgMap> synSemArgMaps = new LinkedList<SynSemArgMap>();

        String[] args = synSemArgs.split(":");
        int index = 0;
        // iterate over syntactic Arguments; index walks the ':'-separated parts in parallel
        for (SyntacticArgument synArg : subcategorizationFrame.getSyntacticArguments()) {
            String synsemArg = args[index];
            if (synsemArg.contains("syntacticProperty")) {
                // syntacticProperty parts produced no SyntacticArgument, so step over them
                index++;
                synsemArg = args[index];
            }
            // look at synsemArg: is semantic role defined? if yes: create corresponding semanticArg
            String[] atts = synsemArg.split(",");
            for (String att : atts) {
                String[] splits = att.split("=");
                String attName = splits[0];
                if (attName.equals("semanticRole")) {
                    SemanticArgument semanticArgument = new SemanticArgument();
                    semanticArgument.setId("GN_SemanticArgument_".concat(Integer.toString(semanticArgumentNumber)));
                    semanticArgumentNumber++;
                    semanticArgument.setSemanticRole(splits[1]);
                    semanticArguments.add(semanticArgument);

                    // Generate SynSemArgMapping: a fresh instance per role, so that earlier
                    // entries of the list are not overwritten by later roles
                    SynSemArgMap synSemArgMap = new SynSemArgMap();
                    synSemArgMap.setSyntacticArgument(synArg);
                    synSemArgMap.setSemanticArgument(semanticArgument);
                    synSemArgMaps.add(synSemArgMap);
                }
            }
            index++;
        }
        semanticPredicate.setSemanticArguments(semanticArguments);

        SynSemCorrespondence synSemCorrespondence = new SynSemCorrespondence();
        synSemCorrespondence.setId("GN_SynSemCorrespondence_".concat(Integer.toString(synSemCorrespondenceNumber)));
        synSemCorrespondenceNumber++;
        synSemCorrespondence.setSynSemArgMaps(synSemArgMaps);
        synSemCorrespondences.add(synSemCorrespondence);
        synsemargsSynSemCorrMap.put(synSemArgs, synSemCorrespondence);

        return semanticPredicate;
    }
}