/*
* Copyright 2012 Fundació Barcelona Media
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.barcelonamedia.uima.reader.DBXMIReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.ResultSet;
import java.util.Hashtable;
import java.util.logging.Logger;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;
import org.apache.commons.io.IOUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.barcelonamedia.uima.reader.DBXMIReader.DAO.DAOException;
import org.barcelonamedia.uima.reader.DBXMIReader.DAO.DAOFactory;
import org.barcelonamedia.uima.reader.DBXMIReader.DAO.XMIDAO;
import org.xml.sax.SAXException;
/**
 * UIMA collection reader that pulls serialized XMI documents out of a
 * database (through an {@link XMIDAO}) and deserializes each one into a CAS.
 *
 * <p>The rows are expected to expose a BLOB column named {@code xmi} and a
 * column named {@code id}. The XMI may optionally be stored deflate-compressed,
 * in which case it is inflated before being handed to
 * {@link XmiCasDeserializer}.
 */
public class DBXMICollectionReader extends CollectionReader_ImplBase{

    /** The logger object. */
    private static final Logger logger = Logger.getLogger(DBXMICollectionReader.class.getName());

    // Supported DBMS: ----------------------------------------
    private static final String MySQL = "MySQL";
    //----------------------------------------------------------

    /** Total reported by {@link #getProgress()} when document counting is disabled. */
    private static final int DEFAULT_NUM_OF_DOCUMENTS = 2;

    /** Corresponds to a parameter that specifies the DBMS to be used.
     * The value of this variable is 'DBMS' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_DBMS = "DBMS";

    /** Corresponds to a parameter that specifies the server where the DBMS is being hosted.
     * The value of this variable is 'Server' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_SERVER = "Server";

    /** Corresponds to a parameter that specifies the port to be used to connect to the specified DBMS.
     * The value of this variable is 'Port' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_PORT = "Port";

    /** Corresponds to a parameter that specifies the name of the database to be used.
     * The value of this variable is 'Database' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_DATABASE = "Database";

    /** Corresponds to a parameter that specifies the username for the specified database.
     * The value of this variable is 'User' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_USER = "User";

    /** Corresponds to a parameter that specifies the password for the specified database.
     * The value of this variable is 'Password' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_PASSWORD = "Password";

    /** Corresponds to the mandatory parameter that holds the SQL sentence
     * handed to the DAO to select the documents.
     * The value of this variable is 'sql_select' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_SQL = "sql_select";

    /** Corresponds to a parameter that specifies whether XMI is to be decompressed or not before inserting it into the CAS.
     * The value of this variable is 'compression' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_DO_DECOMPRESSION = "compression";

    /** Corresponds to a parameter that specifies whether the number of documents to be processed is queried or not.
     * Disabling it is useful when the SQL sentence is expensive to run.
     * The value of this variable is 'enableDocCounter' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    private static final String PARAM_ENABLE_DOC_COUNTER = "enableDocCounter";

    /** Name of the configuration parameter that must be set to indicate if the
     * execution fails if an encountered type is unknown.
     * The value of this variable is 'FailOnUnknownType' which is the name of
     * the parameter in the descriptor file that must be set.
     * @see "/DBXMICollectionReader/desc/DBXMICollectionReader"
     **/
    public static final String PARAM_FAILUNKNOWN = "FailOnUnknownType";

    /** DAO Factory object. */
    private DAOFactory daoFactory;

    /** XMI DAO object. */
    private XMIDAO documentDAO;

    /** XMI decompression flag. */
    private boolean doDecompression;

    /** Flag which enables/disables retrieving the number of documents. */
    private boolean retrieveNumberOfDocs;

    /** Cursor over the selected documents. */
    private ResultSet documents;

    /** Whether deserialization must fail on types unknown to the CAS type system. */
    private boolean failOnUnknownType;

    /** Total number of documents reported as progress. */
    private int documentSize;

    /** Number of documents successfully delivered so far. */
    private int currentIndex;

    /** ID of the document being processed. */
    private String documentID;

    /** Document in process. */
    private InputStream documentData;

    /** True when a row has been fetched by hasNext() and not yet consumed by getNext(). */
    private boolean documentPending;

    /** True once the cursor has reported that there are no more rows. */
    private boolean exhausted;

    /** True once closeConnection() has been invoked on the DAO. */
    private boolean connectionClosed;

    /**
     * Initialize the component. Reads the configuration parameters from the
     * UIMA context, validates them, builds the DAO and opens the document
     * cursor.
     *
     * <p>Optional Boolean parameters that are absent default to: no
     * decompression, no document counting, fail on unknown types.
     *
     * @throws ResourceInitializationException
     * If a mandatory parameter is missing or empty, the DBMS is not
     * supported, or the DAO fails.
     */
    public void initialize() throws ResourceInitializationException{

        System.out.println("DBXMICollectionReader: initialize()...");
        logger.info("initialize()...");

        String dbms = (String) getUimaContext().getConfigParameterValue(PARAM_DBMS);
        String server = (String) getUimaContext().getConfigParameterValue(PARAM_SERVER);
        // Kept boxed: unboxing a missing parameter would raise a NullPointerException.
        Integer port = (Integer) getUimaContext().getConfigParameterValue(PARAM_PORT);
        String database = (String) getUimaContext().getConfigParameterValue(PARAM_DATABASE);
        String user = (String) getUimaContext().getConfigParameterValue(PARAM_USER);
        String password = (String) getUimaContext().getConfigParameterValue(PARAM_PASSWORD);
        String sql_sentence = (String) getUimaContext().getConfigParameterValue(PARAM_SQL);

        Boolean decompress = (Boolean) getUimaContext().getConfigParameterValue(PARAM_DO_DECOMPRESSION);
        Boolean countDocuments = (Boolean) getUimaContext().getConfigParameterValue(PARAM_ENABLE_DOC_COUNTER);
        Boolean failUnknown = (Boolean) getUimaContext().getConfigParameterValue(PARAM_FAILUNKNOWN);

        this.doDecompression = Boolean.TRUE.equals(decompress);
        this.retrieveNumberOfDocs = Boolean.TRUE.equals(countDocuments);
        // Strict by default: only an explicit 'false' makes deserialization lenient.
        this.failOnUnknownType = !Boolean.FALSE.equals(failUnknown);

        requireParameter(PARAM_DBMS, dbms);
        requireParameter(PARAM_SERVER, server);
        requireParameter(PARAM_PORT, port);
        requireParameter(PARAM_DATABASE, database);
        requireParameter(PARAM_USER, user);
        requireParameter(PARAM_PASSWORD, password);
        requireParameter(PARAM_SQL, sql_sentence);

        logger.info("initialize() - dbms: " + dbms);
        logger.info("initialize() - server: " + server);
        logger.info("initialize() - port: " + port);
        logger.info("initialize() - database: " + database);
        logger.info("initialize() - user: " + user);
        // The password is deliberately not logged.
        logger.info("initialize() - sql_sentence: " + sql_sentence);

        if(!MySQL.equals(dbms)){
            // Fail here instead of leaving the DAO unset and failing later in hasNext().
            throw new ResourceInitializationException(
                    new IllegalArgumentException("Unsupported DBMS: " + dbms));
        }

        System.out.println("DBXMICollectionReader: initialize() - Using MySQL as DBMS.");

        this.daoFactory = DAOFactory.getDAOFactory(DAOFactory.MYSQL);

        Hashtable<String, String> connectionParams = new Hashtable<String, String>();
        connectionParams.put("server", server);
        connectionParams.put("port", String.valueOf(port));
        connectionParams.put("database", database);
        connectionParams.put("user", user);
        connectionParams.put("password", password);

        this.documentDAO = this.daoFactory.getDocumentDAO(connectionParams);
        this.documentDAO.setSQLSentence(sql_sentence);

        try{
            if(this.retrieveNumberOfDocs){
                this.documentSize = this.documentDAO.getNumberOfXMI();
            }
            else{
                this.documentSize = DEFAULT_NUM_OF_DOCUMENTS;
            }

            this.documents = this.documentDAO.getXMI();
        }
        catch(DAOException e){
            throw new ResourceInitializationException(e);
        }

        this.currentIndex = 0;
        this.documentID = null;
        this.documentData = null;
        this.documentPending = false;
        this.exhausted = false;
        this.connectionClosed = false;

        logger.info("initialize() - Done.");
    }

    /**
     * Deserializes the pending document into the given CAS, inflating it first
     * when decompression is enabled. If no document is pending, one is fetched
     * through {@link #hasNext()}.
     *
     * @param aCAS
     * The CAS to populate.
     *
     * @throws IOException
     * If the data cannot be read or is not valid compressed data.
     * @throws CollectionException
     * If there is no document left or the XMI cannot be parsed.
     */
    public void getNext(CAS aCAS) throws IOException, CollectionException{

        if(!this.documentPending && !hasNext()){
            throw new CollectionException(
                    new IllegalStateException("getNext() called but there are no more documents"));
        }

        // Consume the pending document so that the next hasNext() advances the cursor.
        InputStream xmiStream = this.documentData;
        this.documentData = null;
        this.documentPending = false;

        try{
            if(this.doDecompression){
                byte[] decompressedData = inflate(IOUtils.toByteArray(xmiStream));
                XmiCasDeserializer.deserialize(new ByteArrayInputStream(decompressedData), aCAS, !this.failOnUnknownType);
            }
            else{
                XmiCasDeserializer.deserialize(xmiStream, aCAS, !this.failOnUnknownType);
            }

            this.currentIndex += 1;
        }
        catch(SAXException e){
            System.err.println("ERROR in Collection Reader " + e.getClass() + ": " + e.getMessage());
            throw new CollectionException(e);
        }
        finally{
            closeQuietly(xmiStream);
        }
    }

    /**
     * Tells whether another document is available, fetching it from the cursor
     * if none is pending. Repeated calls without an intervening
     * {@link #getNext(CAS)} do not advance the cursor again.
     *
     * <p>When the cursor is exhausted the DAO connection is closed. If reading
     * fails, {@code getXMIFrom} is invoked with the last document ID and the
     * read is retried once.
     *
     * @return true if a document is ready to be consumed by getNext().
     *
     * @throws CollectionException
     * If the retry after a failure also fails.
     */
    public boolean hasNext() throws IOException, CollectionException{

        if(this.documentPending){
            return true;
        }

        if(this.exhausted){
            return false;
        }

        try{
            return fetchNextDocument();
        }
        catch(Exception e){
            logger.warning("hasNext() - read failed, trying to reconnect: " + e);

            try{
                // try to reconnect and continue.
                // NOTE(review): whatever getXMIFrom returns is not assigned to
                // this.documents; presumably the DAO refreshes the cursor it
                // handed out - confirm against XMIDAO.
                this.documentDAO.getXMIFrom(this.documentID);
                return fetchNextDocument();
            }
            catch(Exception reconnectFailure){
                System.err.println("ERROR in reconnect");
                throw new CollectionException(reconnectFailure);
            }
        }
    }

    /**
     * Reports progress as delivered documents over the total document count
     * (or the default placeholder when counting is disabled).
     *
     * @return a single-element array with the progress in entities.
     */
    public Progress[] getProgress(){
        return new Progress[] { new ProgressImpl(this.currentIndex, this.documentSize, Progress.ENTITIES) };
    }

    /**
     * Releases the pending document stream and closes the DAO connection if it
     * has not been closed yet (for instance when reading stops before the
     * cursor is exhausted).
     *
     * @throws IOException
     * If closing the connection fails.
     */
    public void close() throws IOException{

        closeQuietly(this.documentData);
        this.documentData = null;
        this.documentPending = false;
        this.exhausted = true;

        try{
            releaseConnection();
        }
        catch(Exception e){
            throw new IOException("Unable to close the database connection", e);
        }
    }

    /** Advances the cursor one row and loads its 'xmi' stream and 'id'; closes the connection when there is no row. */
    private boolean fetchNextDocument() throws Exception{

        if(!this.documents.next()){
            this.exhausted = true;
            releaseConnection();
            return false;
        }

        //get document
        this.documentData = this.documents.getBlob("xmi").getBinaryStream();
        //get document id
        this.documentID = this.documents.getString("id");
        this.documentPending = true;

        return true;
    }

    /** Calls closeConnection() on the DAO at most once. */
    private void releaseConnection() throws Exception{

        if(this.documentDAO != null && !this.connectionClosed){
            this.connectionClosed = true;
            this.documentDAO.closeConnection();
        }
    }

    /** Inflates deflate-compressed bytes, always releasing the Inflater and failing on truncated input. */
    private static byte[] inflate(byte[] compressed) throws IOException{

        Inflater decompressor = new Inflater();

        try{
            decompressor.setInput(compressed);

            //Expandable byte array to hold the decompressed data
            ByteArrayOutputStream bos = new ByteArrayOutputStream(compressed.length);
            byte[] buf = new byte[1024];

            while(!decompressor.finished()){
                int count = decompressor.inflate(buf);

                // No progress and nothing more to feed: the loop would never end.
                if(count == 0 && (decompressor.needsInput() || decompressor.needsDictionary())){
                    throw new IOException("Truncated or unsupported compressed XMI data");
                }

                bos.write(buf, 0, count);
            }

            return bos.toByteArray();
        }
        catch(DataFormatException e){
            System.err.println("ERROR in Collection Reader " + e.getClass() + ": " + e.getMessage());
            throw new IOException("Invalid compressed XMI data", e);
        }
        finally{
            // Frees the native memory held by the Inflater.
            decompressor.end();
        }
    }

    /** Closes the stream, ignoring failures; used only for cleanup. */
    private static void closeQuietly(InputStream stream){

        if(stream == null){
            return;
        }

        try{
            stream.close();
        }
        catch(IOException ignored){
            // Best-effort cleanup: the document has already been processed or abandoned.
        }
    }

    /** Rejects a mandatory configuration value that is null or an empty string. */
    private static void requireParameter(String name, Object value) throws ResourceInitializationException{

        boolean missing = value == null
                || (value instanceof String && ((String) value).length() == 0);

        if(missing){
            throw new ResourceInitializationException(
                    new IllegalArgumentException("Missing mandatory configuration parameter: " + name));
        }
    }
}