/******************************************************************************* * Copyright 2016 * Ubiquitous Knowledge Processing (UKP) Lab * Technische Universität Darmstadt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. ******************************************************************************/ package de.tudarmstadt.ukp.uby.integration.alignment.xml.transform; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.LinkedList; import java.util.List; import java.util.TreeMap; import javax.xml.transform.TransformerException; import org.dom4j.DocumentException; import org.xml.sax.SAXException; import de.tudarmstadt.ukp.integration.alignment.xml.model.Decisiontype; import de.tudarmstadt.ukp.integration.alignment.xml.model.Source; import de.tudarmstadt.ukp.integration.alignment.xml.model.Target; import de.tudarmstadt.ukp.lmf.model.core.GlobalInformation; import de.tudarmstadt.ukp.lmf.model.core.LexicalResource; import de.tudarmstadt.ukp.lmf.model.core.Lexicon; import de.tudarmstadt.ukp.lmf.model.core.Sense; import de.tudarmstadt.ukp.lmf.model.enums.ESenseAxisType; import de.tudarmstadt.ukp.lmf.model.meta.MetaData; import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis; import de.tudarmstadt.ukp.lmf.transform.DBConfig; import de.tudarmstadt.ukp.lmf.transform.LMFXmlWriter; /** * Create uby lexical resource containing sense axes * directly from generic alignment xml file * Replaces SenseAlignment and children of SenseAlignment */ public class SenseAlignmentGenericXml extends AlignmentGenericXml { private final TreeMap<String, SenseAxis> senseAxisMap; /* protected final static Log logger = LogFactory .getLog(SenseAlignmentGenericXml.class); */ public SenseAlignmentGenericXml(String sourceUrl, String dbDriver, String dbVendor, String alignmentFile, String user, String pass) { super(sourceUrl, dbDriver, dbVendor, alignmentFile, user, pass); senseAxisMap = new TreeMap<>(); } public SenseAlignmentGenericXml(DBConfig dbconf, String alignmentFile) { super(dbconf,alignmentFile); senseAxisMap = new TreeMap<>(); } /** * Convert sense alignment in generic alignment xml format to LMF * * @param idPrefix * @throws ParseException */ @Override public void getAlignment(String idPrefix) throws ParseException { logger.info("looking up alignment"); // expect single decisiontype Decisiontype decisiontype = metadata.decisiontypes.get(0); String sourceType = metadata.sourceResource.identifiertype; String destType = metadata.targetResource.identifiertype; /* determine sense axis type based on languages */ ESenseAxisType senseAxisType = null; if (metadata.sourceResource.language .equals(metadata.targetResource.language)) { senseAxisType = ESenseAxisType.monolingualSenseAlignment; } else { senseAxisType = ESenseAxisType.crosslingualSenseAlignment; } MetaData meta = new MetaData(); DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd"); Date d = formatter.parse(metadata.date); meta.setCreationDate(d); meta.setId(metadata.identifier); meta.setVersion(metadata.version); meta.setAutomatic(decisiontype.type == Decisiontype.Decision.AUTOMATIC); meta.setCreationProcess(decisiontype.id); meta.setCreationTool(metadata.description); lmfMetaData.add(meta); Lexicon sourceLexicon = uby .getLexiconById(metadata.sourceResource.id); Lexicon destLexicon = uby .getLexiconById(metadata.targetResource.id); int id = 0; /* Lookup of alignments in UBY */ for (Source source : alignments) { List<Sense> sourceSenses = getSenses(sourceType, source.ref, sourceLexicon); for (Target target : source.targets) { // only add "positive" alignments for now! - nonalignments are // not modeled if (target.decision.value == true) { List<Sense> destSenses = getSenses(destType, target.ref, destLexicon); for (Sense sourceSense : sourceSenses) { for (Sense destSense : destSenses) { if (destSense != null && sourceSense != null) { // avoid duplicates if (!senseAxisMap.containsKey(sourceSense .getId() + "%%" + destSense.getId())) { SenseAxis axis = new SenseAxis(); axis.setId(idPrefix + "_" + id); // set confidence score if available if (target.decision.confidence != null) { axis.setConfidence(target.decision.confidence); } else { axis.setConfidence(DEFAULTCONFSCORE); } axis.setLexiconOne(sourceLexicon); axis.setLexiconTwo(destLexicon); axis.setMetaData(meta); axis.setSenseAxisType(senseAxisType); axis.setSenseOne(sourceSense); axis.setSenseTwo(destSense); if (sourceType.equals(UBY_SYNSET_ID)) { axis.setSynsetOne(sourceSense .getSynset()); } if (destType.equals(UBY_SYNSET_ID)) { axis.setSynsetTwo(destSense.getSynset()); } // axis.setSenseAxisRelations(senseAxisRelations); senseAxisMap.put(sourceSense.getId() + "%%" + destSense.getId(), axis); id++; } else { System.err.println("catching duplicates " + sourceSense.getId() + "%%" + destSense.getId()); } } else { logString.append("No alignment for: " + source.ref + " " + target.ref); logString.append(LF); nullAlignment++; } } } } } } logString.append("So many input id pairs could not be aligned: " + nullAlignment); logger.info(logString.toString()); } /** * Write sense alignments to UBY LMF xml * * @param idPrefix * @param dtdVersion * @throws IOException * @throws TransformerException * @throws SAXException */ @Override public void toLMF(String idPrefix, String dtdVersion, String outfile) throws IOException, TransformerException, SAXException { LMFXmlWriter xmlWriter = new LMFXmlWriter(outfile, UBY_HOME + "/resources/dtd/DTD_unifiedModel_" + dtdVersion + ".dtd"); LexicalResource lexicalResource = new LexicalResource(); List<MetaData> metaDataList = new ArrayList<MetaData>( lmfMetaData.size()); int i = 0; for (MetaData meta : lmfMetaData) { meta.setId(idPrefix + "_Meta_" + i); metaDataList.add(meta); i++; } lexicalResource.setMetaData(metaDataList); // set metadata for // lexicalresource! LinkedList<SenseAxis> senseaxes = new LinkedList<>(); senseaxes.addAll(senseAxisMap.values()); lexicalResource.setSenseAxes(senseaxes); lexicalResource.setDtdVersion(dtdVersion); lexicalResource.setName("Uby_Alignments_" + idPrefix); GlobalInformation globalInformation = new GlobalInformation(); globalInformation.setLabel("Alignments_" + idPrefix); lexicalResource.setGlobalInformation(globalInformation); xmlWriter.writeElement(lexicalResource); xmlWriter.writeEndDocument(); } /** * Read UBY LMF XML to database * * @param dbConfig * @param xmlSource * @param idPrefix * @throws DocumentException * @throws FileNotFoundException * @throws IllegalArgumentException */ public static void toDB(DBConfig dbConfig, File xmlSource, String idPrefix) throws DocumentException, FileNotFoundException, IllegalArgumentException { convertToDB(dbConfig, xmlSource, "Uby_Alignments_" + idPrefix); } }