/*******************************************************************************
* Copyright 2016
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package de.tudarmstadt.ukp.lmf.transform;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.hibernate.Query;
import org.hibernate.ScrollMode;
import org.hibernate.ScrollableResults;
import org.hibernate.Session;
import org.hibernate.criterion.DetachedCriteria;
import org.hibernate.criterion.Restrictions;
import org.xml.sax.SAXException;

import de.tudarmstadt.ukp.lmf.api.CriteriaIterator;
import de.tudarmstadt.ukp.lmf.model.core.LexicalEntry;
import de.tudarmstadt.ukp.lmf.model.core.LexicalResource;
import de.tudarmstadt.ukp.lmf.model.core.Lexicon;
import de.tudarmstadt.ukp.lmf.model.interfaces.IHasID;
import de.tudarmstadt.ukp.lmf.model.multilingual.PredicateArgumentAxis;
import de.tudarmstadt.ukp.lmf.model.multilingual.SenseAxis;
import de.tudarmstadt.ukp.lmf.model.semantics.SemanticPredicate;
import de.tudarmstadt.ukp.lmf.model.semantics.SynSemCorrespondence;
import de.tudarmstadt.ukp.lmf.model.semantics.Synset;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrame;
import de.tudarmstadt.ukp.lmf.model.syntax.SubcategorizationFrameSet;
/**
* Converts a given lexical resource from a UBY database to a UBY-XML file.
* @author Yevgen Chebotar
* @author Zijad Maksuti
* @author Christian M. Meyer
* @since UBY 0.1.0
*/
public class DBToXMLTransformer extends UBYHibernateTransformer {
private static final Log logger = LogFactory.getLog(DBToXMLTransformer.class);
protected LexicalResource lexicalResource;
protected DBConfig dbConfig;
/** Constructs a new {@link DBToXMLTransformer} instance which is used to
* convert UBY from a database to an XML file.
* @param dbConfig {@link DBConfig} instance used to access the database.
* @param outputPath the file path of the resulting XML file.
* @param dtdPath the file path of the DTD file. */
public DBToXMLTransformer(final DBConfig dbConfig, final String outputPath,
String dtdPath) throws FileNotFoundException, SAXException {
this(dbConfig, new FileOutputStream(outputPath), dtdPath);
this.dbConfig = dbConfig;
}
/** Constructs a new {@link DBToXMLTransformer} instance which is used to
* convert UBY from a database to an XML file.
* @param dbConfig {@link DBConfig} instance used to access the database.
* @param outputStream the (file) stream of the resulting XML data.
* @param dtdPath the file path of the DTD file. */
public DBToXMLTransformer(final DBConfig dbConfig,
final OutputStream outputStream, final String dtdPath)
throws SAXException {
super(dbConfig);
writeStartDocument(outputStream, dtdPath);
}
/**
* Transforms a {@link LexicalResource} instance retrieved from a database
* to a XML file.
*
* @param lexicalResource the lexical resource retrived from the database
*
* @throws SAXException if a severe error occurs when writing to a file
*
* @since UBY 0.1.0
*/
public void transform(final LexicalResource lexicalResource) throws SAXException {
openSession();
try {
String lexicalResourceName = lexicalResource.getName();
this.lexicalResource = (LexicalResource)session.get(LexicalResource.class, lexicalResourceName);
logger.info("Started writing lexicalResource " + lexicalResourceName);
doTransform(true, (Lexicon[]) null);
} finally {
closeSession();
}
}
public void transform(final LexicalResource lexicalResource,
final Lexicon lexicon) throws SAXException {
this.lexicalResource = lexicalResource;
openSession();
try {
doTransform(false, lexicon);
} finally {
closeSession();
}
}
/**
* Transforms a {@link LexicalResource} instance retrieved from a database
* to a XML file. The created XML only contains {@link Lexicon} instances which
* names are specified in the consumed {@link Set}. {@link SenseAxis} instances are omitted.
*
* @param lexicalResource the lexical resource retrieved from the database
*
* @param lexicons the set of names of lexicons which should be written to XML file
*
* @throws SAXException if a severe error occurs when writing to a file
*
* @since UBY 0.2.0
*
* @see #transform(LexicalResource)
* @see #transformSenseAxes(LexicalResource)
*/
public void transformLexicons(final LexicalResource lexicalResource,
final Set<String> lexicons) throws SAXException {
this.lexicalResource = lexicalResource;
openSession();
try {
doTransform(false, lexicons.toArray(new Lexicon[0]));
} finally {
closeSession();
}
}
/**
* Transforms a {@link LexicalResource} instance retrieved from a database
* to a XML file. The created XML only contains {@link SenseAxis} contained in the
* consumed lexical resource.
*
* @param lexicalResource the lexical resource retrieved from the database
*
* @throws SAXException if a severe error occurs when writing to a file
*
* @since UBY 0.2.0
*
* @see #transform(LexicalResource)
* @see #transformLexicons(LexicalResource, List)
*/
public void transformSenseAxes(final LexicalResource lexicalResource)
throws SAXException {
this.lexicalResource = lexicalResource;
openSession();
try {
doTransform(true, new Lexicon[0]);
} finally {
closeSession();
}
}
// lexicons = null (all lexicons), lexicons.length = 0 (no lexicons).
protected void doTransform(boolean includeAxes,
final Lexicon... includeLexicons) throws SAXException {
final int bufferSize = 100;
commitCounter = 1;
writeStartElement(lexicalResource);
// Iterate over all lexicons
if (includeLexicons == null || includeLexicons.length > 0) {
for (Lexicon lexicon : lexicalResource.getLexicons()) {
String lexiconName = lexicon.getName();
// Check if we want to include this lexicon.
if (includeLexicons != null) {
boolean found = false;
for (Lexicon l : includeLexicons) {
if (lexiconName.equals(l.getName())) {
found = true;
break;
}
}
if (!found) {
continue;
}
}
logger.info("Processing lexicon: " + lexiconName);
writeStartElement(lexicon);
// Iterate over all possible sub-elements of this Lexicon and
// write them to the XML
Class<?>[] lexiconClassesToSave = {
LexicalEntry.class,
SubcategorizationFrame.class,
SubcategorizationFrameSet.class,
SemanticPredicate.class,
Synset.class,
SynSemCorrespondence.class,
//ConstraintSet.class
};
// "Unfortunately, MySQL does not treat large offset values efficiently by default and will still read all the rows prior to an offset value. It is common to see a query with an offset above 100,000 take over 20 times longer than an offset of zero!"
// http://www.numerati.com/2012/06/26/reading-large-result-sets-with-hibernate-and-mysql/
for(Class<?> clazz : lexiconClassesToSave) {
/*DetachedCriteria criteria = DetachedCriteria.forClass(clazz)
.add(Restrictions.sqlRestriction("lexiconId = '" + lexicon.getId() + "'"));
CriteriaIterator<Object> iter = new CriteriaIterator<Object>(criteria, sessionFactory, bufferSize);
while (iter.hasNext()) {
Object obj = iter.next();
writeElement(obj);
session.evict(obj);
commitCounter++;
if (commitCounter % 1000 == 0)
logger.info("progress: " + commitCounter + " class instances written to file");
}*/
Session lookupSession = sessionFactory.openSession();
Query query = lookupSession.createQuery("FROM " + clazz.getSimpleName()
+ " WHERE lexiconId = '" + lexicon.getId() + "' ORDER BY id");
query.setReadOnly(true);
if (DBConfig.MYSQL.equals(dbConfig.getDBType())) {
query.setFetchSize(Integer.MIN_VALUE); // MIN_VALUE gives hint to JDBC driver to stream results
}
else {
query.setFetchSize(1000);
}
ScrollableResults results = query.scroll(ScrollMode.FORWARD_ONLY);
while (results.next()) {
// For streamed query results, no further queries are allowed (incl. lazy proxy queries!)
// Detach the object from the lookup session and reload it using the "official" session.
Object[] rows = results.get();
Object row = rows[0];
lookupSession.evict(row);
lookupSession.evict(rows);
rows = null;
row = session.get(row.getClass(), ((IHasID) row).getId());
writeElement(row);
session.evict(row);
row = null;
commitCounter++;
if (commitCounter % 1000 == 0) {
logger.info("progress: " + commitCounter + " class instances written to file");
}
if (commitCounter % 10000 == 0) {
closeSession();
openSession();
}
}
results.close();
lookupSession.close();
}
writeEndElement(lexicon);
}
}
// Iterate over SenseAxes and write them to XMLX when not only
// lexicons should be converted
if (includeAxes) {
logger.info("Processing sense axes");
DetachedCriteria criteria = DetachedCriteria.forClass(SenseAxis.class)
.add(Restrictions.sqlRestriction("lexicalResourceId = '" + lexicalResource.getName() + "'"));
CriteriaIterator<Object> iter = new CriteriaIterator<Object>(criteria, sessionFactory, bufferSize);
while (iter.hasNext()) {
Object obj = iter.next();
writeElement(obj);
session.evict(obj);
commitCounter++;
if (commitCounter % 1000 == 0) {
logger.info("progress: " + commitCounter + " class instances written to file");
}
}
logger.info("Processing predicateargument axes");
DetachedCriteria criteria2 = DetachedCriteria.forClass(PredicateArgumentAxis.class)
.add(Restrictions.sqlRestriction("lexicalResourceId = '" + lexicalResource.getName() + "'"));
CriteriaIterator<Object> iter2 = new CriteriaIterator<Object>(criteria2, sessionFactory, bufferSize);
while (iter2.hasNext()) {
Object obj = iter2.next();
writeElement(obj);
session.evict(obj);
commitCounter++;
if (commitCounter % 1000 == 0) {
logger.info("progress: " + commitCounter + " class instances written to file");
}
}
}
writeEndElement(lexicalResource);
writeEndDocument();
}
@Override
protected String getResourceAlias() {
return lexicalResource.getName();
}
}