/*
* Copyright 2012 FundaciĆ³ Barcelona Media
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.barcelonamedia.uima.reader.DBReader;
import java.io.IOException;
import java.sql.ResultSet;
import java.util.Hashtable;
import java.util.logging.Logger;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader_ImplBase;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.barcelonamedia.uima.reader.DBReader.DAO.DAOException;
import org.barcelonamedia.uima.reader.DBReader.DAO.DAOFactory;
import org.barcelonamedia.uima.reader.DBReader.DAO.DocumentDAO;
public class DBCollectionReader extends CollectionReader_ImplBase{
/** The logger object. */
private static final Logger logger = Logger.getLogger(DBCollectionReader.class.toString());
// Suported DBMS: -----------------------------------------
private static final String MySQL = "MySQL";
//----------------------------------------------------------
/** Correponds to a parameter that specifies DBMS to be used.
* The value of this variable is 'DBMS' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBCollectionReader/desc/DBCollectionReader"
**/
public static final String PARAM_DBMS = "DBMS";
/** Correponds to a parameter that specifies the server where DBMS is being hosted.
* The value of this variable is 'Server' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBCollectionReader/desc/DBCollectionReader"
**/
public static final String PARAM_SERVER = "Server";
/** Correponds to a parameter that specifies port to be used to connect to the specified DBMS.
* The value of this variable is 'Port' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBCollectionReader/desc/DBCollectionReader"
**/
public static final String PARAM_PORT = "Port";
/** Correponds to a parameter that specifies the name of the database to be used.
* The value of this variable is 'Database' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBCollectionReader/desc/DBCollectionReader"
**/
public static final String PARAM_DATABASE = "Database";
/** Correponds to a parameter that specifies the username fof the specified database.
* The value of this variable is 'User' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBCollectionReader/desc/DBCollectionReader"
**/
public static final String PARAM_USER = "User";
/** Correponds to a parameter that specifies the password of the specified database.
* The value of this variable is 'Password' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBCollectionReader/desc/DBCollectionReader"
**/
public static final String PARAM_PASSWORD = "Password";
/** Correponds to a parameter that specifies the sql statement to be executed in order to
* retrieve the proper information.
* The value of this variable is 'sql_select' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBCollectionReader/desc/DBCollectionReader"
**/
public static final String PARAM_SQL = "sql_select";
/** Correponds to an optional parameter that contains the language of the documents in the database.
* If not specified, the default system encoding will be used.
* The value of this variable is 'Language' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBCollectionReader/desc/DBCollectionReader"
**/
public static final String PARAM_LANGUAGE = "Language";
/** DAO Factory object. */
private DAOFactory daoFactory;
/** XMI DAO object. */
private DocumentDAO documentDAO;
private ResultSet documents;
private int documentSize;
private String language;
private int documentCounter;
/** ID of the document in process **/
private String documentID;
/** Document in process **/
private String documentData;
/**
* Initialize the component. Retrieve the parameters and process them,
* parsing the field descriptions and preparing the structures needed to
* process the documents.
*
* @param aContext
* The UIMA context.
*
* @throws ResourceInitializationException
* If an error occurs with some resource.
*
* @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext)
*/
public void initialize() throws ResourceInitializationException {
System.out.println("DBCollectionReader: initialize()...");
logger.info("initialize()...");
String dbms = (String) getUimaContext().getConfigParameterValue(PARAM_DBMS);
String server = (String) getUimaContext().getConfigParameterValue(PARAM_SERVER);
int port = (Integer) getUimaContext().getConfigParameterValue(PARAM_PORT);
String database = (String) getUimaContext().getConfigParameterValue(PARAM_DATABASE);
String user = (String) getUimaContext().getConfigParameterValue(PARAM_USER);
String password = (String) getUimaContext().getConfigParameterValue(PARAM_PASSWORD);
String sql_sentence = (String) getUimaContext().getConfigParameterValue(PARAM_SQL);
this.language = (String) getUimaContext().getConfigParameterValue(PARAM_LANGUAGE);
if((dbms == null || dbms.length() == 0) ||
(server == null || server.length() == 0) ||
(new Integer(port) == null) ||
(database == null || database.length() == 0) ||
(user == null || user.length() == 0) ||
(password == null || password.length() == 0) ||
(sql_sentence == null || sql_sentence.length() == 0)){
throw new ResourceInitializationException();
}
logger.info("initialize() - dbms: " + dbms);
logger.info("initialize() - server: " + server);
logger.info("initialize() - port: " + port);
logger.info("initialize() - database: " + database);
logger.info("initialize() - user: " + user);
logger.info("initialize() - password: " + password);
logger.info("initialize() - sql_sentence: " + sql_sentence);
if(dbms.equals(MySQL)){
System.out.println("DBCollectionReader: initialize() - Using MySQL as DBMS.");
this.daoFactory = DAOFactory.getDAOFactory(DAOFactory.MYSQL);
Hashtable<String, String> connectionParams = new Hashtable<String, String>();
connectionParams.put("server", server);
connectionParams.put("port", String.valueOf(port));
connectionParams.put("database", database);
connectionParams.put("user", user);
connectionParams.put("password", password);
this.documentDAO = this.daoFactory.getDocumentDAO(connectionParams);
this.documentDAO.setSQLSentence(sql_sentence);
try{
this.documents = this.documentDAO.getDocumentsText();
this.documentSize = this.documentDAO.getNumberOfDocumentsText();
}
catch (DAOException e){
throw new ResourceInitializationException(e);
}
}
this.documentCounter = 0;
logger.info("initialize() - Done.");
}
public void getNext(CAS aCAS) throws IOException, CollectionException {
JCas jcas;
try{
jcas = aCAS.getJCas();
}
catch(CASException e){
throw new CollectionException(e);
}
// put document in CAS
jcas.setDocumentText(documentData);
// set language if it was explicitly specified as a configuration parameter
if(this.language != null){
jcas.setDocumentLanguage(this.language);
}
// Also store location of source document in CAS. This information is critical
// if CAS Consumers will need to know where the original document contents are located.
// For example, the Semantic Search CAS Indexer writes this information into the
// search index that it creates, which allows applications that use the search index to
// locate the documents that satisfy their semantic queries.
SourceDocumentInformation srcDocInfo = new SourceDocumentInformation(jcas);
srcDocInfo.setUri("file:///" + this.documentID); // needs to be a valid URI with a known protocol
srcDocInfo.setOffsetInSource(0);
srcDocInfo.setDocumentSize((int) this.documentData.length());
this.documentCounter += 1;
srcDocInfo.setLastSegment(true);
srcDocInfo.addToIndexes();
}
public boolean hasNext() throws IOException, CollectionException {
try{
boolean hasNext = this.documents.next();
if(!hasNext){
this.documentDAO.closeConnection();
}
else{
//get document
this.documentData = this.documents.getString("text");
//get document id
this.documentID = this.documents.getString("id");
}
return hasNext;
}
catch(Exception e){
try{
// try to reconnect and continue.
this.documentDAO.getDocumentsTextFrom(this.documentID);
boolean hasNext = this.documents.next();
if(!hasNext){
this.documentDAO.closeConnection();
return false;
}
//get document
this.documentData = this.documents.getString("text");
//get document id
this.documentID = this.documents.getString("id");
return true;
}
catch(Exception E){
System.err.println("ERROR in reconnect");
throw new CollectionException(E);
}
}
}
public Progress[] getProgress(){
return new Progress[] { new ProgressImpl(this.documentCounter, this.documentSize, Progress.ENTITIES) };
}
public void close() throws IOException {
}
}