/*
* Copyright 2012 Fundació Barcelona Media
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.barcelonamedia.uima.consumer.SQLAnnotationsConsumer;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.collection.CasConsumer_ImplBase;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.util.ProcessTrace;
import org.barcelonamedia.uima.consumer.SQLAnnotationsConsumer.DAO.DAOException;
import org.barcelonamedia.uima.consumer.SQLAnnotationsConsumer.DAO.DAOFactory;
import org.barcelonamedia.uima.consumer.SQLAnnotationsConsumer.DAO.FeaturesDAO;
import org.barcelonamedia.uima.consumer.SQLAnnotationsConsumer.DTO.FeaturesDTO;
import org.barcelonamedia.uima.consumer.features.FeatureInfoSet;
import org.barcelonamedia.uima.consumer.features.FeatureInfoSetManager;
import org.barcelonamedia.uima.consumer.features.FilteringFeatureInfoSet;
public class DBAnnotationsCASConsumer extends CasConsumer_ImplBase{
/** The logger object. */
private static final Logger logger = Logger.getLogger(DBAnnotationsCASConsumer.class.toString());
// Supported DBMS: -----------------------------------------
private static final String MySQL = "MySQL";
//----------------------------------------------------------
/** Corresponds to a parameter that specifies the DBMS to be used.
* The value of this variable is 'DBMS' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String PARAM_DBMS = "DBMS";
/** Corresponds to a parameter that specifies the server where the DBMS is being hosted.
* The value of this variable is 'Server' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String PARAM_SERVER = "Server";
/** Corresponds to a parameter that specifies the port to be used to connect to the specified DBMS.
* The value of this variable is 'Port' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String PARAM_PORT = "Port";
/** Corresponds to a parameter that specifies the name of the database to be used.
* The value of this variable is 'Database' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String PARAM_DATABASE = "Database";
/** Corresponds to a parameter that specifies the username for the specified database.
* The value of this variable is 'User' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String PARAM_USER = "User";
/** Corresponds to a parameter that specifies the password for the specified database.
* The value of this variable is 'Password' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String PARAM_PASSWORD = "Password";
/** Corresponds to a parameter that specifies the database table to be used.
* The value of this variable is 'table' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String PARAM_TABLE = "table";
/** Corresponds to a parameter that specifies the annotation to be used as each database row entry.
* The value of this variable is 'seg_type' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String SEG_TYPE_NAME = "seg_type";
/** Corresponds to a parameter that specifies the features to retrieve and register in the database.
* The value of this variable is 'features' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String FEATURES = "features";
/** Corresponds to a parameter that specifies the names of the relational database columns for the different features.
* (features_column_names[n] name refers to features[n] feature).
* The value of this variable is 'features_column_names' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String FEATURES_COLUMN_NAMES = "features_column_names";
/** Corresponds to a parameter that specifies the character to be used to concatenate different features.
* (features_concat_chars[n] concat character will be used for features[n] feature).
* The value of this variable is 'features_concat_chars' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String FEATURES_CONCAT_CHARS = "features_concat_chars";
/** Corresponds to a parameter that specifies the character to be used to replace white spaces in feature values.
* (features_whitespace_char[n] replacement character will be used for features[n] feature).
* The value of this variable is 'features_whitespace_char' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBAnnotationsCASConsumer/desc/DBAnnotationsCASConsumer.xml"
**/
private static final String FEATURES_WHITESPACE_CHARS = "features_whitespace_char";
/** Corresponds to a parameter that specifies whether the whole document URI or only the document name is to be used as document ID in the database.
* The value of this variable is 'fullURI' which is the name of
* the parameter in the descriptor file that must be set.
* @see "/DBXMICASConsumer/desc/DBXMICASConsumer.xml"
**/
private static final String FULL_URI = "fullURI";
/** Corresponds to a parameter that specifies whether to use an existing table
* (if it exists, otherwise a new one is created)
**/
private static final String USE_EXISTING_TABLE = "useExistingTable";
/**
* Keyword specifying that a white space is to be used for concatenating features or to replace white spaces in a feature value
* (This parameter is used because simple white space is not allowed in an XML tag)
*/
private static final String WHITE_SPACE_RESERVED_KEY = "BLANK";
/** Name corresponding to the database table column containing document id */
private static final String DOC_ID = "doc_id";
/** Fictitious feature name used to request the annotation id */
private static final String ANNOTATION_ID = "id";
/** Fictitious feature name used to request the annotation covered text */
private static final String ANNOTATION_COVERED_TEXT = "coveredText";
/** SEG_TYPE_NAME configuration parameter value */
private String seg_type_name;
/** Handler of the features to be extracted **/
FeatureInfoSetManager featureInfoSetManager;
/** Annotation type used for splitting **/
private Type seg_type;
/** Full URI flag **/
private Boolean fullURI;
/** Use existing table flag **/
private Boolean useExistingTable;
/** DAO Factory object. */
private DAOFactory daoFactory;
/** Features DAO object. */
private FeaturesDAO featuresDAO;
/**
 * Reads the configuration parameters from the UIMA context, validates them, builds the
 * feature descriptors and prepares the database table through the features DAO.
 *
 * @throws ResourceInitializationException
 *             if a mandatory parameter is missing or empty, if the per-feature array
 *             parameters have mismatched sizes, if the configured DBMS is not supported,
 *             or if the DAO fails while preparing the table
 */
public void initialize() throws ResourceInitializationException{
    System.out.println("DBAnnotationsCASConsumer: initialize()...");
    logger.info("DBAnnotationsCASConsumer: initialize()...");

    String dbms = (String) getUimaContext().getConfigParameterValue(PARAM_DBMS);
    String server = (String) getUimaContext().getConfigParameterValue(PARAM_SERVER);
    // Kept boxed on purpose: unboxing a missing 'Port' into an int would throw a
    // NullPointerException before the validation below has a chance to report it.
    Integer port = (Integer) getUimaContext().getConfigParameterValue(PARAM_PORT);
    String database = (String) getUimaContext().getConfigParameterValue(PARAM_DATABASE);
    String user = (String) getUimaContext().getConfigParameterValue(PARAM_USER);
    String password = (String) getUimaContext().getConfigParameterValue(PARAM_PASSWORD);
    String table = (String) getUimaContext().getConfigParameterValue(PARAM_TABLE);
    this.seg_type_name = (String) getUimaContext().getConfigParameterValue(SEG_TYPE_NAME);
    String[] features = (String[]) getUimaContext().getConfigParameterValue(FEATURES);
    String[] features_column_names = (String[]) getUimaContext().getConfigParameterValue(FEATURES_COLUMN_NAMES);
    String[] features_concat_chars = (String[]) getUimaContext().getConfigParameterValue(FEATURES_CONCAT_CHARS);
    String[] features_whitespaces_chars = (String[]) getUimaContext().getConfigParameterValue(FEATURES_WHITESPACE_CHARS);

    // Flags that are absent from the descriptor default to false, so later
    // 'if(flag)' tests cannot fail while unboxing a null Boolean.
    Boolean fullURIParam = (Boolean) getUimaContext().getConfigParameterValue(FULL_URI);
    Boolean useExistingTableParam = (Boolean) getUimaContext().getConfigParameterValue(USE_EXISTING_TABLE);
    this.fullURI = (fullURIParam != null) ? fullURIParam : Boolean.FALSE;
    this.useExistingTable = (useExistingTableParam != null) ? useExistingTableParam : Boolean.FALSE;

    // Find the first missing/empty parameter so that the failure names it.
    String missingParam = null;
    if(isNullOrEmpty(dbms)){
        missingParam = PARAM_DBMS;
    }
    else if(isNullOrEmpty(server)){
        missingParam = PARAM_SERVER;
    }
    else if(port == null){
        missingParam = PARAM_PORT;
    }
    else if(isNullOrEmpty(database)){
        missingParam = PARAM_DATABASE;
    }
    else if(isNullOrEmpty(user)){
        missingParam = PARAM_USER;
    }
    else if(isNullOrEmpty(password)){
        missingParam = PARAM_PASSWORD;
    }
    else if(isNullOrEmpty(table)){
        missingParam = PARAM_TABLE;
    }
    else if(isNullOrEmpty(this.seg_type_name)){
        missingParam = SEG_TYPE_NAME;
    }
    else if(features == null || features.length == 0){
        missingParam = FEATURES;
    }
    else if(features_column_names == null || features_column_names.length == 0){
        missingParam = FEATURES_COLUMN_NAMES;
    }
    else if(features_concat_chars != null && features_concat_chars.length == 0){
        missingParam = FEATURES_CONCAT_CHARS;
    }
    else if(features_whitespaces_chars != null && features_whitespaces_chars.length == 0){
        missingParam = FEATURES_WHITESPACE_CHARS;
    }
    if(missingParam != null){
        String message = "DBAnnotationsCASConsumer :: initialize() :: ERROR: missing or empty configuration parameter '" + missingParam + "'.";
        logger.log(Level.SEVERE, message);
        throw new ResourceInitializationException(new IllegalArgumentException(message));
    }

    // NOTE(review): the check above tolerates null concat/whitespace arrays, but this size
    // check (unchanged from the original logic) rejects them, which makes both parameters
    // effectively mandatory. Confirm whether they are meant to be optional before relaxing it.
    boolean sizesMatch = features.length == features_column_names.length
            && features_concat_chars != null
            && features_column_names.length == features_concat_chars.length
            && features_whitespaces_chars != null
            && features_column_names.length == features_whitespaces_chars.length;
    if(!sizesMatch){
        String message = "DBAnnotationsCASConsumer :: initialize() :: ERROR: array params sizes mismatched.";
        logger.log(Level.SEVERE, message);
        System.err.println("ERROR: DBAnnotationsCASConsumer :: initialize() :: Error: array params sizes mismatched.");
        throw new ResourceInitializationException(new IllegalArgumentException(message));
    }

    logger.info("DBAnnotationsCASConsumer: initialize() - dbms: " + dbms);
    logger.info("DBAnnotationsCASConsumer: initialize() - server: " + server);
    logger.info("DBAnnotationsCASConsumer: initialize() - port: " + port);
    logger.info("DBAnnotationsCASConsumer: initialize() - database: " + database);
    logger.info("DBAnnotationsCASConsumer: initialize() - user: " + user);
    // The password is deliberately not logged: credentials must not end up in log files.
    logger.info("DBAnnotationsCASConsumer: initialize() - table: " + table);
    logger.info("DBAnnotationsCASConsumer: initialize() - seg_type_name: " + this.seg_type_name);
    // Arrays.toString prints the elements; String[].toString() would only print an identity hash.
    logger.info("DBAnnotationsCASConsumer: initialize() - features: " + Arrays.toString(features));
    logger.info("DBAnnotationsCASConsumer: initialize() - features column names: " + Arrays.toString(features_column_names));
    if(features_concat_chars != null){
        logger.info("DBAnnotationsCASConsumer: initialize() - features concat chars: " + Arrays.toString(features_concat_chars));
        expandBlankKeyword(features_concat_chars);
    }
    if(features_whitespaces_chars != null){
        logger.info("DBAnnotationsCASConsumer: initialize() - features whitespaces chars: " + Arrays.toString(features_whitespaces_chars));
        expandBlankKeyword(features_whitespaces_chars);
    }
    logger.info("DBAnnotationsCASConsumer: initialize() - full URI as doc Id: " + this.fullURI);
    logger.info("DBAnnotationsCASConsumer: initialize() - use existing table (if available): " + this.useExistingTable);

    this.featureInfoSetManager = new FeatureInfoSetManager(features, features_column_names, features_concat_chars, features_whitespaces_chars);
    this.featureInfoSetManager.buildFeaturesInfoSet();

    // Fail fast on an unsupported DBMS: continuing would leave featuresDAO unset and
    // make the first processCas() call crash with a NullPointerException.
    if(!dbms.equals(MySQL)){
        String message = "DBAnnotationsCASConsumer :: initialize() :: ERROR: unsupported DBMS '" + dbms + "' (supported: " + MySQL + ").";
        logger.log(Level.SEVERE, message);
        throw new ResourceInitializationException(new IllegalArgumentException(message));
    }

    System.out.println("DBAnnotationsCASConsumer: initialize() - Using MySQL as DBMS.");
    this.daoFactory = DAOFactory.getDAOFactory(DAOFactory.MYSQL);
    Hashtable<String, String> connectionParams = new Hashtable<String, String>();
    connectionParams.put("server", server);
    connectionParams.put("port", String.valueOf(port));
    connectionParams.put("database", database);
    connectionParams.put("user", user);
    connectionParams.put("password", password);
    this.featuresDAO = this.daoFactory.getFeaturesDAO(connectionParams);
    try{
        Hashtable<String, String> tableInfo = new Hashtable<String, String>();
        tableInfo.put("table", table);
        tableInfo.put("doc_id", DOC_ID);
        tableInfo.put("features", this.featureInfoSetManager.getFeaturesDatabaseColumns());
        this.featuresDAO.setTableInfo(tableInfo);
        this.featuresDAO.createTable(tableInfo, this.useExistingTable);
        this.featuresDAO.init();
    }
    catch(DAOException e){
        // Pass the exception itself so the stack trace reaches the log.
        logger.log(Level.SEVERE, e.getMessage(), e);
        System.err.println("DBAnnotationsCASConsumer: ERROR in CAS Consumer " +e.getClass() + " with message:" + e.getMessage());
        throw new ResourceInitializationException(e);
    }
    logger.info("DBAnnotationsCASConsumer: initialize() - Done.");
}

/** Tells whether a configuration string is null or has zero length. */
private static boolean isNullOrEmpty(String value){
    return value == null || value.length() == 0;
}

/** Replaces, in place, every entry equal to WHITE_SPACE_RESERVED_KEY by a single white space. */
private static void expandBlankKeyword(String[] values){
    for(int i = 0; i < values.length; i++){
        // Constant-first comparison so a null array element cannot throw.
        if(WHITE_SPACE_RESERVED_KEY.equals(values[i])){
            values[i] = " ";
        }
    }
}
/**
 * Resolves the segmentation annotation type, whose name was read from the
 * 'seg_type' configuration parameter, against the given type system.
 * <br>
 * When the type system does not declare that type the lookup yields null; this is
 * reported in the log at SEVERE level so the misconfiguration is visible before
 * any CAS is processed.
 *
 * @param typeSystem
 *            the type system in which the segmentation type is looked up
 */
public void typeSystemInit(TypeSystem typeSystem){
    System.out.println("DBAnnotationsCASConsumer: typeSystemInit() - Loading provided types...");
    this.seg_type = typeSystem.getType(this.seg_type_name);
    if(this.seg_type == null){
        logger.log(Level.SEVERE, "DBAnnotationsCASConsumer: typeSystemInit() - ERROR: type '" + this.seg_type_name + "' not found in the type system.");
    }
    System.out.println("DBAnnotationsCASConsumer: typeSystemInit() - Types loaded.");
}
/**
 * Processes the CAS which was populated by the TextAnalysisEngines. <br>
 * For every annotation of the segmentation type, the configured features are
 * gathered (from the segment itself or from the annotations overlapping it) and
 * handed to the features DAO as one insert.
 *
 * @param aCAS
 *            a CAS which has been populated by the TAEs
 *
 * @throws ResourceProcessException
 *             if the document id cannot be determined, or if the consumer has no
 *             segmentation type or no DAO to work with
 *
 * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
 */
public void processCas(CAS aCAS) throws ResourceProcessException{
    // Report missing prerequisites explicitly instead of failing later with a bare NullPointerException.
    if(this.seg_type == null || this.featuresDAO == null){
        String message = "DBAnnotationsCASConsumer: processCas() - ERROR: consumer not properly initialized (segmentation type: "
                + this.seg_type_name + ", DAO available: " + (this.featuresDAO != null) + ").";
        logger.log(Level.SEVERE, message);
        throw new ResourceProcessException(new IllegalStateException(message));
    }

    String doc_id = resolveDocumentId(aCAS);

    Iterator<AnnotationFS> seg_type_iterator = aCAS.getAnnotationIndex(this.seg_type).iterator();
    while(seg_type_iterator.hasNext()){
        Annotation seg_annotation = (Annotation) seg_type_iterator.next();
        for(int i = 0; i < this.featureInfoSetManager.getFeatureInfoSetList().size(); i++){
            FeatureInfoSet featureInfoSet = this.featureInfoSetManager.getFeatureInfoSetList().get(i);
            collectFeatureValues(aCAS, seg_annotation, featureInfoSet, i, doc_id);
        }
        FeaturesDTO featuresdto = new FeaturesDTO(this.featureInfoSetManager.getFeaturesValues(doc_id));
        try{
            this.featuresDAO.insert(featuresdto);
        }
        catch(DAOException e){
            // Best effort, as before: a failed insert does not abort the remaining segments,
            // but it is now also recorded in the log together with its stack trace.
            System.err.println("DBAnnotationsCASConsumer: ERROR in CAS Consumer " +e.getClass() + " with message:" + e.getMessage());
            logger.log(Level.SEVERE, "DBAnnotationsCASConsumer: processCas() - insert failed for doc '" + doc_id + "': " + e.getMessage(), e);
        }
    }
}

/**
 * Builds the document id from the first SourceDocumentInformation annotation of the CAS:
 * the full URI or just the file name (depending on the fullURI flag), followed by
 * "_offset_size" when the CAS holds only a segment of the source document.
 * Returns an empty string when the CAS carries no SourceDocumentInformation.
 */
private String resolveDocumentId(CAS aCAS) throws ResourceProcessException{
    String doc_id = "";
    try{
        JCas jcas = aCAS.getJCas();
        FSIterator<Annotation> it = jcas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
        if(it.hasNext()){
            SourceDocumentInformation sdi = (SourceDocumentInformation) it.next();
            // Boolean.TRUE.equals(...) treats an unset (null) flag as false instead of failing on unboxing.
            if(Boolean.TRUE.equals(this.fullURI)){
                doc_id = sdi.getUri().toString();
            }
            else{
                doc_id = new File(new URL(sdi.getUri()).getPath()).getName();
            }
            if(sdi.getOffsetInSource() > 0 || !sdi.getLastSegment()){
                doc_id += ("_" + sdi.getOffsetInSource() + "_" + sdi.getDocumentSize());
            }
        }
        else{
            logger.warning("DBAnnotationsCASConsumer: processCas() - no SourceDocumentInformation found; using an empty document id.");
        }
    }
    catch(Exception e){
        logger.log(Level.SEVERE, e.getMessage(), e);
        throw new ResourceProcessException(e);
    }
    return doc_id;
}

/**
 * Adds to featureInfoSet (at position index) the values of one configured feature for one segment.
 * The names "id" and "coveredText" are not real features: they select the annotation hash code
 * and the annotation covered text respectively.
 */
private void collectFeatureValues(CAS aCAS, Annotation seg_annotation, FeatureInfoSet featureInfoSet, int index, String doc_id){
    FilteringFeatureInfoSet filteringFeatureInfoSet = featureInfoSet.getFilteringFeatureInfoSet();
    String type_name = featureInfoSet.getType();
    Type annotation_type = aCAS.getTypeSystem().getType(type_name);
    Feature feature = aCAS.getTypeSystem().getFeatureByFullName(featureInfoSet.getQualifiedName());
    Feature filtering_feature = null;
    if(filteringFeatureInfoSet != null){
        filtering_feature = aCAS.getTypeSystem().getFeatureByFullName(filteringFeatureInfoSet.getQualifiedName());
    }
    String feature_name = featureInfoSet.getName();
    boolean idFeature = ANNOTATION_ID.equals(feature_name);
    boolean coveredTextFeature = ANNOTATION_COVERED_TEXT.equals(feature_name);

    // Feature unknown to the type system and not one of the two fictitious ones: empty value.
    if(feature == null && !idFeature && !coveredTextFeature){
        featureInfoSet.addFeatureValues(index, "");
        return;
    }
    // The annotation type itself is unknown: asking the CAS for its index would fail, so
    // record an empty value just like for an unknown feature.
    if(annotation_type == null){
        logger.severe(String.format("unknown annotation type: %s (doc: %s)", type_name, doc_id));
        featureInfoSet.addFeatureValues(index, "");
        return;
    }

    if(annotation_type == this.seg_type){
        // The feature belongs to the segmentation annotation itself.
        String feature_value;
        if(idFeature){
            feature_value = String.valueOf(seg_annotation.hashCode());
        }
        else if(coveredTextFeature){
            feature_value = seg_annotation.getCoveredText();
        }
        else{
            feature_value = seg_annotation.getFeatureValueAsString(feature);
        }
        if(feature_value == null){
            logger.severe(String.format("missing feature: %s:%s (doc: %s, <%s>)",type_name,feature_name,doc_id,seg_annotation.getCoveredText()));
        }
        featureInfoSet.addFeatureValues(index, splitOnSpaces(feature_value));
        return;
    }

    // The feature belongs to another annotation type: visit the annotations overlapping the segment.
    int seg_ann_begin = seg_annotation.getBegin();
    int seg_ann_end = seg_annotation.getEnd();
    String annotation_type_name = annotation_type.getName();
    Iterator<AnnotationFS> annotation_iterator = aCAS.getAnnotationIndex(annotation_type).iterator();
    while(annotation_iterator.hasNext()){
        Annotation annotation = (Annotation) annotation_iterator.next();
        // The index also returns subtypes; keep only annotations of exactly the requested type.
        // Names are compared through getName() rather than relying on Type.toString().
        if(!annotation.getType().getName().equals(annotation_type_name)){
            continue;
        }
        int ann_begin = annotation.getBegin();
        int ann_end = annotation.getEnd();
        if(ann_begin > seg_ann_end){
            // When annotations can't overlap anymore
            break;
        }
        if(!((ann_begin < seg_ann_end) && (ann_end > seg_ann_begin))){
            continue;
        }

        String featureValue = null;
        if(!idFeature && !coveredTextFeature){
            featureValue = annotation.getFeatureValueAsString(feature);
            if(featureValue == null){
                // As in the original flow, an unset feature yields an empty value without consulting the filter.
                featureInfoSet.addFeatureValues(index, "");
                continue;
            }
        }

        if(filteringFeatureInfoSet != null){
            if(filtering_feature == null){
                // Filter configured but its feature is not in the type system: nothing can pass.
                continue;
            }
            String filterValue = annotation.getFeatureValueAsString(filtering_feature);
            // A null value cannot match; handing null to the matcher would throw.
            if(filterValue == null || !filteringFeatureInfoSet.getMatcher().reset(filterValue).find()){
                continue;
            }
        }

        if(idFeature){
            featureInfoSet.addFeatureValues(index, String.valueOf(annotation.hashCode()));
        }
        else if(coveredTextFeature){
            featureInfoSet.addFeatureValues(index, splitOnSpaces(annotation.getCoveredText()));
        }
        else{
            featureInfoSet.addFeatureValues(index, splitOnSpaces(featureValue));
        }
    }
}

/** Splits a value on single spaces; a null value gives an empty list. */
private static ArrayList<String> splitOnSpaces(String value){
    ArrayList<String> tokens = new ArrayList<String>();
    if(value != null){
        Collections.addAll(tokens, value.split(" "));
    }
    return tokens;
}
/**
 * Closes the database connection held by the features DAO once the whole
 * collection has been processed.
 *
 * @param arg0
 *            the process trace handed over by the framework (not used here)
 *
 * @throws ResourceProcessException
 *             if the DAO fails while closing the connection
 * @throws IOException
 *             part of the existing signature; nothing in this method throws it explicitly
 */
public void collectionProcessComplete(ProcessTrace arg0) throws ResourceProcessException, IOException{
    // The DAO is only created by a successful initialize(); without it there is nothing to close.
    if(this.featuresDAO != null){
        try{
            this.featuresDAO.closeConnection();
        }
        catch(DAOException e){
            // Pass the exception itself so the stack trace reaches the log.
            logger.log(Level.SEVERE, e.getMessage(), e);
            throw new ResourceProcessException(e);
        }
    }
    else{
        logger.warning("DBAnnotationsCASConsumer: collectionProcessComplete() - no DAO available, nothing to close.");
    }
    System.out.println("DBAnnotationsCASConsumer: collectionProcessComplete()...");
    logger.info("DBAnnotationsCASConsumer: collectionProcessComplete() - Done.");
}
}