/** * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <http://www.gnu.org/licenses/>. */ package de.tudarmstadt.ukp.lmf.transform.germanet; import java.io.File; import java.util.HashMap; import java.util.List; import java.util.Map; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.dom4j.Document; import org.dom4j.DocumentException; import org.dom4j.Element; import org.dom4j.io.SAXReader; import de.tuebingen.uni.sfs.germanet.api.GermaNet; import de.tuebingen.uni.sfs.germanet.api.LexUnit; import de.tuebingen.uni.sfs.germanet.api.Synset; /** * This class offers methods for extraction of semantic class labels * of {@link LexUnit} and {@link Synset} instances. <br> * Instance of this class parses the names of <a href="URL#http://www.sfs.uni-tuebingen.de/lsd/index.shtml">GermaNet 7.0</a> * files (semantic labels) and determines the LexUnits and Synsets contained in the file for which the semantic label applies. * @author Zijad Maksuti * @author Judith Eckle-Kohler * */ public class SemanticClassLabelExtractor { private final File gnData; // directory containing the GermaNet files private final Map<Integer, String> luMappings = new HashMap<Integer, String>(); // luID <-> filename mappings private final Map<Integer, String> synsetMappings = new HashMap<Integer, String>(); // synsetID <-> filename mappings private final Log logger = LogFactory.getLog(getClass()); /** * Constructs an instance of {@link SemanticClassLabelExtractor} * * @param gn initialized {@link GermaNet} object used to access GermaNet's information * */ public SemanticClassLabelExtractor(GermaNet gn) { this.gnData = new File(gn.getDir()); this.initialize(); } /** * This method consumes an instance of {@link LexUnit}, and returns it's semantic class label * @param lu a LexUnit for which semantic class label should be extracted * @return lu's semantic class label or null if the extractor contains no mapping for the lu's id */ public String getLUSemanticClassLabel(LexUnit lu){ String result = luMappings.get(lu.getId()); if(result != null) { return result.split("\\.")[1]; // extract the semantic class label } return result; } /** * This class consumes an instance of {@link Synset}, and returns it's semantic class label * @param synset a Synset for which semantic class label should be extracted * @return synset's semantic class label or null if the extractor contains no mapping for the synset's ID */ public String getSynsetSemanticClassLabel(Synset synset){ int synsetID=synset.getId(); String result = synsetMappings.get(synsetID); if(result != null) { return result.split("\\.")[1]; // extract the SemanticClasLabel } return result; } /** * This method iterates over GermaNet's files and extracts semantic class labels <br> * of Synsets and LexUnits * @see LexUnit * @see Synset */ private void initialize() { if(luMappings.isEmpty()){ logger.info("Initializing SemanticClassLabelExtractor... "); String[] fileNames = gnData.list(); // Names of all files in GermaNet's directory for (String fileName : fileNames) { // If a file starts with "adj.", "nomen." or "verben." and ends with ".xml" it should be examined if ((fileName.startsWith("adj.") || fileName.startsWith("nomen.") || fileName.startsWith("verben")) && fileName.endsWith(".xml")) { SAXReader reader = new SAXReader(); Document document = null; try { document = reader.read(new File(gnData.getAbsolutePath() + "/" + fileName)); } catch (DocumentException e) { StringBuffer sb = new StringBuffer(128); sb.append("SemanticClassLabelExtractor: error on reading GermaNet's files"); sb.append('\n').append("Aborting all operations!").append('\n'); sb.append("cause").append('\n'); sb.append(e.getMessage()); logger.error(sb.toString()); System.exit(1); } Element root = document.getRootElement(); // Extracting synsets List<?> synsets = root.elements("synset"); for (Object synset : synsets) { Element synsetElem = (Element) synset; String synsetID = synsetElem.attributeValue("id").substring(1); synsetMappings.put(Integer.parseInt(synsetID), fileName); // Extracting LUs List<?> lus = synsetElem.elements("lexUnit"); for (Object lu : lus) { Element luElem = (Element) lu; String luID = luElem.attributeValue("id").substring(1); luMappings.put(Integer.parseInt(luID), fileName); } } } } logger.info("Initializing SemanticClassLabelExtractor done"); } } }