/* * Copyright 2012 FundaciĆ³ Barcelona Media * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.barcelonamedia.uima.consumer.features; import java.util.ArrayList; import org.barcelonamedia.uima.consumer.utils.CollectionUtils; public class FeatureInfoSetManager{ /** * Delimiter for features definition for specifying which feature value * will be used for filtering the feature to be extracted. */ private static final String FILTERING_FEATURE_DELIMITER = "#"; /** * Delimiter used for the different values of a given feature. */ private static final String DEFAULT_FEATURES_CONCAT_CHAR = " "; /** * Delimiter used to replace white spaces into the value of a feature. */ private static final String DEFAULT_FEATURES_WHITESPACE_CHAR = "_"; /** * Delimiter used in the feature qualified name (Type:feature) of the features to be extracted. */ private static final String TYPESYSTEM_FEATURES_DELIMITER = ":"; private ArrayList<FeatureInfoSet> featureInfoSetList = new ArrayList<FeatureInfoSet>(); /** * FEATURES configuration parameter value */ private String[] features; /** * FEATURES_DISPLAY_NAMES configuration parameter value */ private String[] featuresColumnNames; /** * FEATURES_CONCAT_CHARS configuration parameter value */ private String[] featuresConcatChars; /** * FEATURES_WHITESPACE_CHARS configuration parameter value */ private String[] featuresWhitespacesChars; public FeatureInfoSetManager() { } public FeatureInfoSetManager(String[] features, String[] features_column_names, String[] features_concat_chars, String[] features_whitespaces_chars){ this.features = features; this.featuresColumnNames = features_column_names; this.featuresConcatChars = features_concat_chars; this.featuresWhitespacesChars = features_whitespaces_chars; } public ArrayList<FeatureInfoSet> getFeatureInfoSetList(){ return featureInfoSetList; } public void setFeatures(String[] features){ this.features = features; } public void setFeaturesColumnNames(String[] features_column_names){ this.featuresColumnNames = features_column_names; } public void setFeaturesConcatChars(String[] features_concat_chars){ this.featuresConcatChars = features_concat_chars; } public void setFeaturesWhitespacesChars(String[] features_whitespaces_chars){ this.featuresWhitespacesChars = features_whitespaces_chars; } public void buildFeaturesInfoSet(){ for(int index=0; index<this.features.length; index++){ FeatureInfoSet featureInfoSet = new FeatureInfoSet(); featureInfoSet.setDatabaseColumnName(this.featuresColumnNames[index]); //Retrieves feature to be extracted and filtering feature int filtering_feature_index = this.features[index].indexOf(FILTERING_FEATURE_DELIMITER); if(filtering_feature_index > 0){ String feature_qualified_name = this.features[index].substring(0, filtering_feature_index); featureInfoSet.setQualifiedName(feature_qualified_name); String[] ts_f = feature_qualified_name.split(TYPESYSTEM_FEATURES_DELIMITER); String type_name = ts_f[0]; String feature_name = ts_f[1]; featureInfoSet.setType(type_name); featureInfoSet.setName(feature_name); String filtering_feature_value = this.features[index].substring((filtering_feature_index + 1), this.features[index].length()); String filtering_fname = filtering_feature_value.substring(0, filtering_feature_value.indexOf("=")); String filtering_fvalue = filtering_feature_value.substring(filtering_feature_value.indexOf("=") + 1, filtering_feature_value.length()); FilteringFeatureInfoSet filteringFeatureInfoSet = new FilteringFeatureInfoSet(); filteringFeatureInfoSet.setQualifiedName(type_name + TYPESYSTEM_FEATURES_DELIMITER + filtering_fname); filteringFeatureInfoSet.setRegex(filtering_fvalue); featureInfoSet.setFilteringFeatureInfoSet(filteringFeatureInfoSet); } else{ String feature_qualified_name = this.features[index]; featureInfoSet.setQualifiedName(feature_qualified_name); String[] ts_f = feature_qualified_name.split(TYPESYSTEM_FEATURES_DELIMITER); String type_name = ts_f[0]; String feature_name = ts_f[1]; featureInfoSet.setType(type_name); featureInfoSet.setName(feature_name); } if(this.featuresConcatChars != null){ featureInfoSet.setConcatChar(this.featuresConcatChars[index]); } else{ featureInfoSet.setConcatChar(DEFAULT_FEATURES_CONCAT_CHAR); } if(this.featuresWhitespacesChars != null){ featureInfoSet.setWhitespaceFillingChar(this.featuresWhitespacesChars[index]); } else{ featureInfoSet.setWhitespaceFillingChar(DEFAULT_FEATURES_WHITESPACE_CHAR); } this.featureInfoSetList.add(featureInfoSet); } } public String getFeaturesDatabaseColumns(){ ArrayList<String> databaseColumnNames = new ArrayList<String>(); for(FeatureInfoSet featureInfoSet : this.featureInfoSetList){ if(!databaseColumnNames.contains(featureInfoSet.getDatabaseColumnName())){ databaseColumnNames.add(featureInfoSet.getDatabaseColumnName()); } } return CollectionUtils.join(databaseColumnNames, ","); } private int getNumberOfDatabaseColumns(){ ArrayList<String> databaseColumnNames = new ArrayList<String>(); int numberOfColumns = 0; for(FeatureInfoSet featureInfoSet : this.featureInfoSetList){ if(!databaseColumnNames.contains(featureInfoSet.getDatabaseColumnName())){ numberOfColumns += 1; databaseColumnNames.add(featureInfoSet.getDatabaseColumnName()); } } return numberOfColumns; } public String[] getFeaturesValues(String doc_id){ String[] featureValues = new String[this.getNumberOfDatabaseColumns() + 1]; featureValues[0] = doc_id; int index = 1; ArrayList<String> processedDatabaseColumnNames = new ArrayList<String>(); for(FeatureInfoSet featureInfoSet : this.featureInfoSetList){ if(!processedDatabaseColumnNames.contains(featureInfoSet.getDatabaseColumnName())){ featureValues[index] = featureInfoSet.getValues(); processedDatabaseColumnNames.add(featureInfoSet.getDatabaseColumnName()); index++; } else{ //Stores in the same column index of the database table int i = processedDatabaseColumnNames.indexOf(featureInfoSet.getDatabaseColumnName()); //Normalizes index, as "doc_id" is in column 0 i+= 1; featureValues[i] = featureValues[i] + DEFAULT_FEATURES_CONCAT_CHAR + featureInfoSet.getValues(); } featureInfoSet.resetValues(); } return featureValues; } }