/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.lmf.transform;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.Method;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.dom4j.Attribute;
import org.dom4j.DocumentException;
import org.dom4j.Element;
import org.dom4j.ElementHandler;
import org.dom4j.ElementPath;
import org.dom4j.io.SAXReader;
import org.xml.sax.EntityResolver;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import de.tudarmstadt.ukp.lmf.model.core.GlobalInformation;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.LexicalResource;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.interfaces.IHasID;
import de.tudarmstadt.ukp.lmf.model.meta.MetaData;
import de.tudarmstadt.ukp.lmf.model.miscellaneous.ConstraintSet;
import de.tudarmstadt.ukp.lmf.model.multilingual.PredicateArgumentAxis;
import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate;
import de.tudarmstadt.ukp.lmf.model.semantics.SynSemCorrespondence;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrameSet;
import de.tudarmstadt.ukp.lmf.transform.UBYLMFClassMetadata.UBYLMFFieldMetadata;
/**
* Converts a given lexical resource from a UBY-XML file to a UBY database
* using Hibernate.
* @author Yevgen Chebotar
* @author Christian M. Meyer
*/
public class XMLToDBTransformer extends UBYHibernateTransformer
implements ElementHandler {
protected LexicalResource lexicalResource; // Current lexical resource
protected Lexicon lexicon; // Current lexicon
protected boolean externalLexicalResource;
public XMLToDBTransformer(final DBConfig dbConfig) {
super(dbConfig);
}
/**
* Read xml File and save its contents to Database
* @param xmlFile
* @param lexicalResourceName
* @throws DocumentException
* @throws UbyInvalidArgumentException
*/
public void transform(File xmlFile, String lexicalResourceName) throws DocumentException, IllegalArgumentException{
long startTime = System.currentTimeMillis();
openSession();
if (lexicalResourceName != null) {
lexicalResource = (LexicalResource) session.get(LexicalResource.class, lexicalResourceName);
}
SAXReader reader = new SAXReader(false);
reader.setEntityResolver(new EntityResolver() {
@Override
public InputSource resolveEntity(String publicId, String systemId)
throws SAXException, IOException {
if (systemId.endsWith(".dtd")) {
return new InputSource(new StringReader(""));
}
return null;
}
});
reader.setDefaultHandler(this);
reader.read(xmlFile);
commit();
closeSession();
System.out.println("TOTAL TIME: " + (System.currentTimeMillis() - startTime));
System.out.println("NUM ENTRIES: " + commitCounter);
}
@Override
public void onStart(ElementPath epath) {
Element el = epath.getCurrent();
String n = el.getName();
// Remove empty attributes and invalid characters.
Iterator<?> attrIter = el.attributeIterator();
while (attrIter.hasNext()) {
Attribute attr = (Attribute) attrIter.next();
if ("NULL".equals(attr.getStringValue())) {
attrIter.remove();
}
else {
attr.setValue(StringUtils.replaceNonUtf8(attr.getValue()));
}
}
if ("LexicalResource".equals(n)) {
// If no lexical resource exists yet, create a new one.
if (lexicalResource == null){
lexicalResource = new LexicalResource();
lexicalResource.setName(el.attributeValue("name"));
lexicalResource.setDtdVersion(el.attributeValue("dtdVersion"));
session.save(lexicalResource);
}
else {
externalLexicalResource = true;
}
} else
if ("Lexicon".equals(n)) {
// Create a new, empty lexicon.
lexicon = new Lexicon();
lexicon.setId(el.attributeValue("id"));
lexicon.setName(el.attributeValue("name"));
lexicon.setLanguageIdentifier(el.attributeValue("languageIdentifier"));
lexicalResource.addLexicon(lexicon);
saveCascade(lexicon, lexicalResource);
}
// Save some global information if we're using a new lexical resource.
else if ("GlobalInformation".equals(n) && !externalLexicalResource) {
GlobalInformation glInformation = new GlobalInformation();
glInformation.setLabel(el.attributeValue("label"));
lexicalResource.setGlobalInformation(glInformation);
saveCascade(glInformation, lexicalResource);
commit();
lexicalResource.setGlobalInformation(null);
}
}
@Override
public void onEnd(ElementPath epath) {
Element el = epath.getCurrent();
String n = el.getName();
Object listElement = null;
// Create instances for all direct children of Lexicon.
if ("LexicalEntry".equals(n)) {
listElement = fromXmlToObject(el, LexicalEntry.class);
saveListElement(lexicon, lexicon.getLexicalEntries(), listElement);
} else
if ("SemanticPredicate".equals(n)) {
listElement = fromXmlToObject(el, SemanticPredicate.class);
saveListElement(lexicon, lexicon.getSemanticPredicates(), listElement);
} else
if ("SubcategorizationFrame".equals(n)) {
listElement = fromXmlToObject(el, SubcategorizationFrame.class);
saveListElement(lexicon, lexicon.getSubcategorizationFrames(), listElement);
} else
if ("SubcategorizationFrameSet".equals(n)) {
listElement = fromXmlToObject(el, SubcategorizationFrameSet.class);
saveListElement(lexicon, lexicon.getSubcategorizationFrameSets(), listElement);
} else
if ("SynSemCorrespondence".equals(n)) {
listElement = fromXmlToObject(el, SynSemCorrespondence.class);
saveListElement(lexicon, lexicon.getSynSemCorrespondences(), listElement);
} else
if ("Synset".equals(n)) {
listElement = fromXmlToObject(el, Synset.class);
saveListElement(lexicon, lexicon.getSynsets(), listElement);
} else
if ("ConstraintSet".equals(n)) {
listElement = fromXmlToObject(el, ConstraintSet.class);
saveListElement(lexicon, lexicon.getConstraintSets(), listElement);
} else
// Create instances for all direct children of LexicalResource.
if ("SenseAxis".equals(n)) {
listElement = fromXmlToObject(el, SenseAxis.class);
saveListElement(lexicalResource, lexicalResource.getSenseAxes(), listElement);
} else
if ("PredicateArgumentAxis".equals(n)) {
listElement = fromXmlToObject(el, PredicateArgumentAxis.class);
saveListElement(lexicalResource, lexicalResource.getPredicateArgumentAxes(), listElement);
} else
if ("MetaData".equals(n)) {
listElement = fromXmlToObject(el, MetaData.class);
saveListElement(lexicalResource, lexicalResource.getMetaData(), listElement);
}
// Forget the corresponding XML elements of the saved instances.
if (listElement != null) {
el.detach();
}
}
/**
* Transforms XML-Element and all its children to Java object
* @param el XML-Element
* @param clazz Java-Class of the Element
* @return
*/
protected Object fromXmlToObject(Element el, Class<?> clazz) {
try {
Object lmfObject = clazz.newInstance();
UBYLMFClassMetadata classMeta = getClassMetadata(clazz);
for (UBYLMFFieldMetadata fieldMeta : classMeta.getFields()) {
String xmlFieldName = fieldMeta.getName().replace("_", "");
Class<?> fieldType = fieldMeta.getType();
// Determine the field's value from the current XML element.
Object newValue = null;
switch (fieldMeta.getVarType()) {
case ATTRIBUTE:
case ATTRIBUTE_OPTIONAL:
String attrValue = el.attributeValue(xmlFieldName);
if (attrValue == null) {
continue;
}
newValue = attrValue;
if (fieldMeta.isBoolean()) {
newValue = GenericUtils.getBoolean(attrValue);
}
else
if (fieldMeta.isInteger()) {
newValue = GenericUtils.getInteger(attrValue);
}
else
if (fieldMeta.isDouble()) {
newValue = GenericUtils.getDouble(attrValue);
}
else
if (fieldMeta.isEnum()) {
newValue = GenericUtils.getEnum(fieldType, attrValue);
}
else
if (fieldMeta.isDate()){
newValue = GenericUtils.getDate(attrValue);
}
break;
case CHILD:
Element childEl = el.element(fieldType.getSimpleName());
if (childEl == null) {
continue;
}
newValue = fromXmlToObject(childEl, fieldType);
break;
case CHILDREN:
Class<?> elementClass = fieldMeta.getGenericElementType();
if (elementClass == null) {
throw new RuntimeException("Unable to obtain list element class for field " + fieldMeta.getName());
}
List<Object> childList = new ArrayList<Object>();
for (Object child : el.elements(elementClass.getSimpleName())) {
childList.add(fromXmlToObject((Element) child, elementClass));
}
newValue = childList;
break;
case IDREF:
String idref = el.attributeValue(xmlFieldName);
if (idref == null || idref.isEmpty()) {
continue;
}
IHasID obj = (IHasID) fieldType.newInstance();
obj.setId(idref);
newValue = obj;
break;
case IDREFS:
String idStr = el.attributeValue(xmlFieldName);
if (idStr == null || idStr.isEmpty()) {
continue;
}
elementClass = fieldMeta.getGenericElementType();
if (elementClass == null) {
throw new RuntimeException("Unable to obtain list element class for field " + fieldMeta.getName());
}
List<Object> idrefList = new ArrayList<Object>();
String ids[] = idStr.split(" ");
for (String id : ids) {
obj = (IHasID) elementClass.newInstance();
obj.setId(id);
idrefList.add(obj);
}
newValue = idrefList;
break;
case NONE:
continue;
}
// Save the new value using the setter method.
Method setter = fieldMeta.getSetter();
if (setter == null) {
throw new RuntimeException("Missing setter for : " + lmfObject.getClass() + "." + xmlFieldName);
}
setter.invoke(lmfObject, newValue);
}
return lmfObject;
} catch (Exception e) {
e.printStackTrace();
}
return null;
}
@Override
protected String getResourceAlias() {
return lexicalResource.getName();
}
}