/******************************************************************************* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.uby.integration.alignment.xml.transform.sensealignments; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; import java.util.LinkedList; import java.util.List; import java.util.TreeMap; import org.apache.commons.io.IOUtils; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import de.tudarmstadt.ukp.integration.alignment.xml.model.Alignments; import de.tudarmstadt.ukp.integration.alignment.xml.model.Decision; import de.tudarmstadt.ukp.integration.alignment.xml.model.Decisiontype; import de.tudarmstadt.ukp.integration.alignment.xml.model.ResourceXml; import de.tudarmstadt.ukp.integration.alignment.xml.model.Source; import de.tudarmstadt.ukp.integration.alignment.xml.model.Target; import de.tudarmstadt.ukp.integration.alignment.xml.model.XmlMeta; import de.tudarmstadt.ukp.lmf.api.Uby; import de.tudarmstadt.ukp.lmf.model.core.Sense; import de.tudarmstadt.ukp.lmf.transform.DBConfig; import de.tudarmstadt.ukp.uby.integration.alignment.xml.transform.SenseAlignmentGenericXml; /** * Convert the FrameNet-WordNet alignments to UBY format. This class takes the * FrameNet 1.5 and WordNet 3.0 ids from a file and integrates them to UBY * * @author Silvana Hartmann * */ public class FnWnSenseAlignmentXml extends SenseAlignmentXml { static String UBY_HOME = System.getenv("UBY_HOME"); static String DKPRO_HOME = System.getenv("DKPRO_HOME"); protected static Log logger = LogFactory .getLog(FnWnSenseAlignmentXml.class); private final Uby uby; ArrayList<String> notfoundWn = null; ArrayList<String> notfoundFn = null; ArrayList<String> notAddedAll; int inputsize = 0; /** * * @param alignmentFile * @param outFile * @param dbConfig */ public FnWnSenseAlignmentXml(String alignmentFile, String outFile, DBConfig dbConfig) { super(alignmentFile, outFile); uby = new Uby(dbConfig); notfoundFn = new ArrayList<String>(); notfoundWn = new ArrayList<String>(); notAddedAll = new ArrayList<String>(); } /** * Collect UBY SenseIds for the aligned senses based on synsetId and lemma * for WordNet and based on lexical unit id for FrameNet * * @throws IOException */ @Override public void toAlignmentXml(XmlMeta metadata) throws IOException { System.err.println("to Alignment Xml"); TreeMap<String, Source> sourceMap = new TreeMap<>(); List<String[]> data = null; data = readAlignmentFile(); int counter = 0; // input sense pairs int found = 0; // output sense pairs // iterate over alignment entries for (String[] d : data) { counter++; // show progress: if ((counter % 1000) == 0) { logger.info("# processed alignments: " + counter); } // use FrameNet sense externalReference (lexical unit Id) String fnSenseId = d[0]; // SOURCE Source source = null; if (sourceMap.containsKey(fnSenseId)) { source = sourceMap.get(fnSenseId); } else { source = new Source(); } source.ref = fnSenseId; List<Target> targets = new LinkedList<Target>(); // get WordNet sense by Synset Offset and Lemma List<Sense> wnSenses = uby.getSensesByWNSynsetId(d[1]); // List<Sense> wnSenses = uby.wordNetSenses(partOfSpeech, offset); for (Sense wnSense : wnSenses) { Target target = new Target(); target.ref = wnSense.getId(); Decision decision = new Decision(); decision.confidence = SenseAlignmentGenericXml.DEFAULTCONFSCORE; decision.value = true; // decision.src = metadata.decisiontypes.get(0).name; target.decision = decision; targets.add(target); found++; } if (targets.size() > 0) { source.targets = targets; sourceMap.put(source.ref, source); } } writer.writeMetaData(metadata); Alignments alignments = new Alignments(); alignments.source = new LinkedList<>(); alignments.source.addAll(sourceMap.values()); writer.writeAlignments(alignments); writer.close(); System.err.println("Alignments in: " + counter + " OUT" + found); logger.info("Alignments in: " + counter + "Alignments out: " + found); } /** * Read alignment file in standard format, e.g.: fn_luId, wn_synset ID, * wn_lemma, fn_lemma * * @return * @throws IOException */ private List<String[]> readAlignmentFile() { List<String[]> alignment = new ArrayList<String[]>(); int lineNumber = 0; BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(alignmentFile)); String line = null; while ((line = reader.readLine()) != null) { lineNumber++; String[] items = line.split("\t"); alignment.add(items); } } catch (FileNotFoundException e) { System.err.println("File not found: " + alignmentFile); e.printStackTrace(); } catch (IOException e) { System.err.println("File could not be opended: " + alignmentFile); IOUtils.closeQuietly(reader); } inputsize = lineNumber; return alignment; } /** * Write output lines to given file * * @param outFile * @param lines * @throws IOException */ protected static void writeLines(String outFile, Collection<String> lines) throws IOException { BufferedWriter writer = null; try { writer = new BufferedWriter(new FileWriter(new File(outFile))); for (String line : lines) { writer.write(line + "\n"); } } catch (IOException e) { System.err .println("Exception" + e + "could not write to" + outFile); } finally { if (writer != null) { writer.close(); } } } @Override public XmlMeta getDefaultXmlMeta() { XmlMeta metadata = new XmlMeta(); metadata.title = "FrameNet - WordNet sense alignment from WordFrameNet "; metadata.creator = "http://adimen.si.ehu.es/web/WordFrameNet"; metadata.date = "2010-03-23"; // from download metadata.description = "WordFrameNet: "; metadata.identifier = "FNWNwfn"; metadata.publisher = "Laparra E. and Rigau G"; metadata.rights = "http://creativecommons.org/licenses/by/3.0/"; metadata.version = "TODO"; ResourceXml targetResource = new ResourceXml(); targetResource.description = "WordNet version 3.x"; // TODO check // matches lexiconId targetResource.id = "WN_Lexicon_0"; targetResource.language = "en"; // matches externalSystem targetResource.identifiertype = SenseAlignmentGenericXml.UBY_SENSE_ID; metadata.targetResource = targetResource; ResourceXml sourceResource = new ResourceXml(); sourceResource.description = "FrameNet version 1.x"; // TODO check // matches lexiconId sourceResource.id = "FN_Lexicon_0"; sourceResource.language = "en"; // matches externalSystem sourceResource.identifiertype = "FrameNet_1.5_eng_lexicalUnit"; metadata.sourceResource = sourceResource; Decisiontype type = new Decisiontype(); type.id = "WFN_FNWN"; type.name = "WFN_FNWN"; type.type = Decisiontype.Decision.AUTOMATIC; List<Decisiontype> decisionTypes = new ArrayList<>(); decisionTypes.add(type); metadata.decisiontypes = decisionTypes; // no separate scores given => no scoretype information return metadata; } public static void main(String[] args) throws Exception { String UBY_HOME = System.getenv("UBY_HOME"); String alignmentFile = UBY_HOME + "/alignments/WordFrameNet_formatted.tsv"; String outFile = UBY_HOME + "/target/wordFrameNet_newXml.xml"; DBConfig dbConfig = new DBConfig("localhost/uby_clarin_0_7_0", "com.mysql.jdbc.Driver", "mysql", "root", "pass", false); FnWnSenseAlignmentXml al = new FnWnSenseAlignmentXml( alignmentFile, outFile, dbConfig); al.toAlignmentXml(al.getDefaultXmlMeta()); } }