/** * Copyright (c) 2014, the LESK-WSD-DSM AUTHORS. * * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * Redistributions of source code must retain the above copyright notice, this * list of conditions and the following disclaimer. * * Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * * Neither the name of the University of Bari nor the names of its contributors * may be used to endorse or promote products derived from this software without * specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. * * GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 * */ package di.uniba.it.wsd.tool.msc; import com.google.common.collect.HashMultiset; import com.google.common.collect.Multiset; import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Objects; import java.util.logging.Level; import java.util.logging.Logger; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; /** * Extract form MultiSemCor file information about synset probability and occurrences * @author pierpaolo */ public class MultiSemCorTransformer { private final Map<String, List<MscObject>> map = new HashMap<>(); private final Multiset<String> occ = HashMultiset.create(); private static final Logger logger = Logger.getLogger(MultiSemCorTransformer.class.getName()); private String convertWnsn(String wnsn) { StringBuilder sb = new StringBuilder(); sb.append(wnsn.substring(2)); sb.append(wnsn.substring(0, 1)); return sb.toString(); } private void processFile(File file) throws IOException, ParserConfigurationException, SAXException { logger.log(Level.INFO, "Processing file: {0}", file); DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder(); Document parse = docBuilder.parse(file); NodeList structElems = parse.getElementsByTagName("struct"); for (int i = 0; i < structElems.getLength(); i++) { NodeList feats = structElems.item(i).getChildNodes(); String lemma = null; String wnsn = null; for (int j = 0; j < feats.getLength(); j++) { Node f = feats.item(j); if (f.getNodeType() == Node.ELEMENT_NODE && f.getNodeName().equals("feat")) { String type = f.getAttributes().getNamedItem("type").getNodeValue(); String value = f.getTextContent().trim(); if (type.equals("lemma")) { lemma = value; } else if (type.equals("wnsn")) { wnsn = value; } } } if (lemma != null && wnsn != null) { String convWnsn=convertWnsn(wnsn); occ.add(convWnsn); String key = lemma + "#" + wnsn.substring(0, 1); List<MscObject> list = map.get(key); if (list == null) { list = new ArrayList<>(); map.put(key, list); } int indexOf = list.indexOf(new MscObject(convWnsn)); if (indexOf >= 0) { list.get(indexOf).setScore(list.get(indexOf).getScore() + 1); } else { list.add(new MscObject(convWnsn, 1)); } } } } private void save(String outputDir) throws IOException { BufferedWriter writer = new BufferedWriter(new FileWriter(outputDir + "/sense.occ")); Iterator<Multiset.Entry<String>> iterator = occ.entrySet().iterator(); while (iterator.hasNext()) { Multiset.Entry<String> entry = iterator.next(); writer.append(entry.getElement()).append("\t").append(String.valueOf(entry.getCount())); writer.newLine(); } writer.close(); writer = new BufferedWriter(new FileWriter(outputDir + "/sense.freq")); Iterator<String> iterator1 = map.keySet().iterator(); while (iterator1.hasNext()) { String key = iterator1.next(); List<MscObject> list = map.get(key); float norm = 0; for (MscObject o : list) { norm += o.getScore(); } norm += (float) list.size(); for (MscObject o : list) { o.setScore((o.getScore() + 1) / (norm)); } writer.append(key); for (MscObject o : list) { writer.append("\t"); writer.append(o.toFileLine()); } writer.newLine(); } writer.close(); } private void processDir(File startDir, String outputPath) throws IOException, ParserConfigurationException, SAXException { map.clear(); occ.clear(); File[] listFiles = startDir.listFiles(); for (File file : listFiles) { processFile(file); } //save save(outputPath); } /** args1=MultiSemCor directory args2=output directory * @param args the command line arguments */ public static void main(String[] args) { try { MultiSemCorTransformer transformer = new MultiSemCorTransformer(); transformer.processDir(new File(args[0]), args[1]); } catch (IOException | ParserConfigurationException | SAXException ex) { Logger.getLogger(MultiSemCorTransformer.class.getName()).log(Level.SEVERE, null, ex); } } private class MscObject implements Comparable<MscObject> { private String id; private float score; public MscObject(String id, float score) { this.id = id; this.score = score; } public MscObject(String id) { this.id = id; } public String getId() { return id; } public void setId(String id) { this.id = id; } public float getScore() { return score; } public void setScore(float score) { this.score = score; } @Override public int hashCode() { int hash = 7; hash = 59 * hash + Objects.hashCode(this.id); return hash; } @Override public boolean equals(Object obj) { if (obj == null) { return false; } if (getClass() != obj.getClass()) { return false; } final MscObject other = (MscObject) obj; if (!Objects.equals(this.id, other.id)) { return false; } return true; } @Override public int compareTo(MscObject o) { return Float.compare(o.score, score); } public String toFileLine() { return id + "\t" + String.valueOf(score); } } }