/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.lmf.transform.sensealignments;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.sql.SQLException;
import java.util.ArrayList;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.stream.XMLStreamException;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
/**
* Convert the FrameNet-WordNet alignment by Ferrandez et al. (2010) to UBY
* format.
*
* Reference: O. Ferrández, M. Ellsworth, R. Muñoz, and C. F. Baker: Aligning
* FrameNet and WordNet Based on Semantic Neighborhoods, LREC 2010
*
* @author Silvana Hartmann
*
* FOR UKP INTERNAL USE ONLY see header
*
*/
public class FramenetWordnetAlignmentFerrandez extends FramenetWordnetAlignment {
public FramenetWordnetAlignmentFerrandez(String sourceUrl, String destUrl,
String alignmentFile, String user, String pass)
throws SQLException, InstantiationException,
IllegalAccessException, ClassNotFoundException,
FileNotFoundException {
super(sourceUrl, destUrl, alignmentFile, user, pass);
}
/**
* Convert original alignment file to .tsv format Output format: fn-luId,
* wn-sensekey, wn lemma, fn lemma
*
* @param inFile
* location of original alignment file
* @param outFile
* output file
* @throws XMLStreamException
* @throws ParserConfigurationException
* @throws IOException
* @throws SAXException
*/
private static void convertFerrandezEtAl(String inFile, String outFile)
throws XMLStreamException, ParserConfigurationException,
SAXException, IOException {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
DocumentBuilder builder = factory.newDocumentBuilder();
Document doc = builder.parse(new File(inFile));
doc.getDocumentElement().normalize();
NodeList entries = doc.getElementsByTagName("alignment");// <alignment
ArrayList<String> output = new ArrayList<String>();
for (int i = 0; i < entries.getLength(); i++) {
Element alignment = (Element) entries.item(i);
NamedNodeMap atts = alignment.getAttributes();
String lemma = atts.getNamedItem("lemma").getTextContent();
lemma = lemma.replace("_", " ");
NodeList lus = alignment.getElementsByTagName("LU");// <LU
for (int k = 0; k < lus.getLength(); k++) {
Element fn = (Element) lus.item(k);
NamedNodeMap atts2 = fn.getAttributes();
String luId = atts2.getNamedItem("ID").getTextContent();
String fnLemma = atts2.getNamedItem("lemma").getTextContent();
fnLemma = fnLemma.replace("_", " "); // replace underscores in
// multiword lemmas by
// whitespace
Node wn = fn.getChildNodes().item(1);
String synsetId = wn.getAttributes().getNamedItem("ID").getNodeValue();
String synsetPos = wn.getAttributes().getNamedItem("PoS").getNodeValue();
NodeList senses = wn.getChildNodes();
for (int j = 0; j < senses.getLength(); j++) {
Node sense = senses.item(j);
String node = sense.getNodeName();
if (node.equals("word")) {
String wnLemma = sense.getTextContent();
wnLemma = wnLemma.replace("_", " ");
// need pos abbreviation for external reference id for
// WN synsets:
String pos = "n";
if (synsetPos.equals("noun")) {
pos = "n";
} else if (synsetPos.equals("adjective")) {
pos = "a";
} else if (synsetPos.equals("adverb")) {
pos = "r";
} else if (synsetPos.equals("verb")) {
pos = "v";
}
// remove semantic qualifiers from WN lemmas, such as
// "sleep ((quantity))":
// Problem: some compound-like structures are also
// filtered: "take ((upon))"
String wnCleanLemma = wnLemma.split("\\(")[0];
output.add(luId + "\t" + synsetId + "-" + pos + "\t"
+ wnCleanLemma + "\t" + fnLemma);
}
}
}
}
writeLines(outFile, output);
logger.info("# entries in original file " + inFile + ": "
+ entries.getLength());
logger.info("# output entries (written to " + outFile
+ "): " + output.size());
}
}