/*******************************************************************************
* Copyright 2012 University of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This code was developed by the Information Integration Group as part
* of the Karma project at the Information Sciences Institute of the
* University of Southern California. For more information, publications,
* and related projects, please see: http://www.isi.edu/integration
******************************************************************************/
package edu.isi.karma.modeling.semantictypes.crfmodelhandler ;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Random;
import java.util.Set;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.isi.karma.modeling.semantictypes.mycrf.crfmodel.CRFModelFieldOnly;
import edu.isi.karma.modeling.semantictypes.mycrf.fieldonly.LblFtrPair;
import edu.isi.karma.modeling.semantictypes.mycrf.globaldata.GlobalDataFieldOnly;
import edu.isi.karma.modeling.semantictypes.mycrf.graph.GraphFieldOnly;
import edu.isi.karma.modeling.semantictypes.mycrf.graph.GraphInterface;
import edu.isi.karma.modeling.semantictypes.mycrf.map.MAPFieldOnly;
import edu.isi.karma.modeling.semantictypes.mycrf.math.Matrix;
import edu.isi.karma.modeling.semantictypes.mycrf.optimization.OptimizeFieldOnly;
import edu.isi.karma.modeling.semantictypes.myutils.ListOps;
import edu.isi.karma.modeling.semantictypes.myutils.Prnt;
import edu.isi.karma.modeling.semantictypes.myutils.RandOps;
import edu.isi.karma.modeling.semantictypes.sl.Lexer;
import edu.isi.karma.modeling.semantictypes.sl.Part;
import edu.isi.karma.modeling.semantictypes.sl.RegexFeatureExtractor;
/**
* This class is an API to the mycrf package.
* It allows instantiating CRF models, training them and using them for prediction.
*
* @author amangoel
*
*/
public class CRFModelHandler {
// ***********************************************************************************************
/**
 * The kinds of column-level features that can be passed along with an example.
 *
 * @author amangoel
 */
public enum ColumnFeature {
    /** Feature carrying the header name of a column. */
    ColumnHeaderName,
    /** Feature carrying the name of a table. */
    TableName
}
// ***********************************************************************************************
// ***********************************************************************************************
/**
 * Internal representation of one example: the example string itself plus at most
 * one string value per {@link ColumnFeature}.
 *
 * @author amangoel
 */
static class Example {
    String exampleString;
    HashMap<ColumnFeature, String> columnFeatures;

    /**
     * Creates an example without any column features.
     *
     * @param exampleString The string that the example represents
     */
    public Example(String exampleString) {
        this.exampleString = exampleString;
        this.columnFeatures = new HashMap<ColumnFeature, String>();
    }

    /**
     * Creates an example with column features.
     * Each ColumnFeature may come with a collection of values, but only the first value
     * returned by the collection's iterator is stored; null or empty collections and a
     * null first value are skipped.
     *
     * @param exampleString The example string
     * @param columnFeatures Associated ColumnFeatures, may be null
     */
    public Example(String exampleString, Map<ColumnFeature, Collection<String>> columnFeatures) {
        this.exampleString = exampleString;
        this.columnFeatures = new HashMap<ColumnFeature, String>();
        if (columnFeatures == null) {
            return;
        }
        for (Map.Entry<ColumnFeature, Collection<String>> entry : columnFeatures.entrySet()) {
            Collection<String> values = entry.getValue();
            if (values == null || values.isEmpty()) {
                continue;
            }
            String firstValue = values.iterator().next();
            if (firstValue != null) {
                this.columnFeatures.put(entry.getKey(), firstValue);
            }
        }
    }

    /**
     * Stores a value for a column feature, replacing any previous value.
     * The call is ignored when either argument is null.
     *
     * @param columnFeature A ColumnFeature
     * @param featureValue Corresponding value of the ColumnFeature
     */
    public void addColumnFeature(ColumnFeature columnFeature, String featureValue) {
        if (columnFeature == null || featureValue == null) {
            return;
        }
        columnFeatures.put(columnFeature, featureValue);
    }

    /**
     * @return The string that this example represents
     */
    public String getString() {
        return exampleString;
    }

    /**
     * @param colFeature ColumnFeature for which the value is required.
     * @return The value stored for the ColumnFeature, or null if this example has no value for it.
     * Callers therefore have to check the result for null.
     */
    public String getValueForColumnFeature(ColumnFeature colFeature) {
        // HashMap.get already yields null for a missing key
        return columnFeatures.get(colFeature);
    }
}
// ***********************************************************************************************
// instance variables
// Path of the model file backing this handler. It is null until a model file has been read
// successfully, and is reset to null when reading or saving fails; the public methods check it first.
String file;
// Maps every label to the list of examples stored for that label.
HashMap<String, ArrayList<Example>> labelToExamplesMap;
// Holds the list of labels, the training graphs and the CRF model (feature functions and weights).
GlobalDataFieldOnly globalData;
// Single-character strings that may appear in an example; filled in by the constructor.
ArrayList<String> allowedCharacters;
// NOTE(review): the logger is named after the simple class name and is not used in the visible part of the file.
static Logger logger = LoggerFactory.getLogger(CRFModelHandler.class.getSimpleName()) ;
// When a label has more distinct features than this, addOrUpdateLabel calls
// selectFeatureSetWithWeightedProbability to choose among them.
static final int MAX_FFs_PER_LABEL = 50;
// Maximum number of examples of one label that are turned into training graphs.
static final int MAX_EXAMPLES_PER_LABEL = 50;
// NOTE(review): not referenced in the visible part of the file; presumably limits how many
// examples per label are written when the model is saved — confirm against saveModel.
static final int MAX_EXAMPLES_SAVED_PER_LABEL = 200;
/**
 * Creates a handler that has no model loaded yet.
 * The model file path, the label-to-examples map and the global data start out as null;
 * only the list of allowed characters is initialized here.
 * Public methods that check the model file path return false while it is null.
 */
public CRFModelHandler() {
    this.allowedCharacters = allowedCharacters();
    this.globalData = null;
    this.labelToExamplesMap = null;
    this.file = null;
}
/**
 * Returns the path of the file that the CRF model is currently using.
 *
 * @return Path to the model file, or null when no model file is in use
 */
public String getModelFilePath() {
    return this.file;
}
/**
 * Adds the passed list of examples to the model under the given label.
 * The label is created if it does not exist yet. The training graphs of the label are
 * rebuilt from at most MAX_EXAMPLES_PER_LABEL of its examples, its feature functions are
 * reselected (limited through MAX_FFs_PER_LABEL), the model is re-optimized and saved.
 * If saving fails, the handler is marked as not ready.
 *
 * @param label True label for the list of examples. Must not be null or blank.
 * @param examples List of example strings. Must not be null or empty, and at least one
 *                 example has to contain an allowed character.
 * @param columnFeatures Map of column features, may be null.
 * @return True if success, else False
 */
public synchronized boolean addOrUpdateLabel(String label, List<String> examples, Map<ColumnFeature, Collection<String>> columnFeatures) {
    if (file == null) {
        Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
        return false;
    }
    // basic sanity checks on the input arguments; a null examples list is rejected
    // here instead of failing with a NullPointerException
    if (label == null || label.trim().length() == 0 || examples == null || examples.size() == 0) {
        Prnt.prn("@label argument cannot be null or an empty string and the @examples list cannot be empty.");
        return false;
    }
    label = label.trim();

    ArrayList<String> cleanedExamples = new ArrayList<String>();
    cleanedExamplesList(examples, cleanedExamples);
    // a non-empty list that contains only junk is not accepted either
    if (cleanedExamples.size() == 0) {
        Prnt.prn("@examples list contains forbidden characters only. The allowed characters are " + allowedCharacters);
        return false;
    }

    // register the label and an empty example list if the label is new
    int labelIndex = globalData.labels.indexOf(label);
    if (labelIndex == -1) {
        globalData.labels.add(label);
        labelIndex = globalData.labels.indexOf(label);
        labelToExamplesMap.put(label, new ArrayList<Example>());
    }

    // append the new examples to the examples already stored for the label
    ArrayList<Example> examplesOfLabel = labelToExamplesMap.get(label);
    for (String newExampleString : cleanedExamples) {
        examplesOfLabel.add(new Example(newExampleString, columnFeatures));
    }

    // train on at most MAX_EXAMPLES_PER_LABEL examples
    ArrayList<Example> selectedExamples = new ArrayList<Example>();
    if (examplesOfLabel.size() <= MAX_EXAMPLES_PER_LABEL) {
        selectedExamples.addAll(examplesOfLabel);
    }
    else {
        RandOps.getRandomlySelectedItemsFromList(examplesOfLabel, selectedExamples, MAX_EXAMPLES_PER_LABEL);
    }

    // replace the training graphs of this label and collect the features of the new graphs
    removeGraphsForLabel(labelIndex);
    ArrayList<String> allFeatures = new ArrayList<String>();
    HashSet<String> tmpFeatures = new HashSet<String>();
    for (Example selectedExample : selectedExamples) {
        featureSet(selectedExample, tmpFeatures);
        GraphFieldOnly newGraph = new GraphFieldOnly(selectedExample.exampleString, label, new ArrayList<String>(tmpFeatures), globalData);
        globalData.trainingGraphs.add(newGraph);
        allFeatures.addAll(tmpFeatures);
    }

    // allFeatures keeps duplicates; when there are more than MAX_FFs_PER_LABEL distinct
    // features, the weighted selection picks the set to use
    HashSet<String> selectedFeatures = new HashSet<String>(allFeatures);
    if (selectedFeatures.size() > MAX_FFs_PER_LABEL) {
        selectFeatureSetWithWeightedProbability(allFeatures, selectedFeatures);
    }

    // reselect the feature functions of the label and re-optimize the model
    reselectFFs(labelIndex, selectedFeatures);
    OptimizeFieldOnly optimizationObject = new OptimizeFieldOnly(globalData.crfModel, globalData);
    optimizationObject.optimize(3);

    // save the model with the new weights; a failed save makes the handler unusable
    boolean savingSuccessful = saveModel();
    if (!savingSuccessful) {
        file = null;
    }
    return savingSuccessful;
}
/**
 * Copies the example strings stored for a label into the supplied list.
 * The list is cleared first, but only after all checks have passed.
 *
 * @param label The label for which examples are being requested.
 * @param examples The list argument that is used to return the examples in the model for the supplied label.
 * @return True, if successful, else False
 */
public boolean getExamplesForLabel(String label, ArrayList<String> examples) {
    if (file == null) {
        Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
        return false;
    }
    boolean labelMissing = (label == null) || (label.trim().length() == 0);
    if (labelMissing || examples == null) {
        Prnt.prn("CRFModelHandler.getExamplesForLabel: Either the label is null, or it is an empty string or examples is null");
        return false;
    }
    label = label.trim();
    if (!globalData.labels.contains(label)) {
        Prnt.prn("CRFModelHandler.getExamplesForLabel: Label " + label + " does not exist in the model.");
        return false;
    }
    examples.clear();
    ArrayList<Example> stored = labelToExamplesMap.get(label);
    for (int position = 0; position < stored.size(); position++) {
        examples.add(stored.get(position).exampleString);
    }
    return true;
}
/**
 * Returns the labels of the model in the order in which the model stores them.
 *
 * @param labels The list that is cleared and then filled with the labels.
 * @return True, if successful, else False
 */
public boolean getLabels(List<String> labels) {
    if (file == null) {
        Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
        return false;
    }
    if (labels != null) {
        labels.clear();
        labels.addAll(globalData.labels);
        return true;
    }
    Prnt.prn("Invalid argument @labels. It is null.");
    return false;
}
/**
 * Predicts the most likely labels for a list of examples of an unknown type.
 * For every example the probability of each label is computed; these are averaged over
 * all examples, and the best labels are returned together with their averaged probabilities.
 * An example that is null or has no allowed characters is replaced by the placeholder ".".
 *
 * @param examples - list of examples of an unknown type
 * @param numPredictions - required number of predictions in descending order
 * @param predictedLabels - the argument in which the ordered list of labels is returned. the size of this list could be smaller than numPredictions
 * if there aren't that many labels in the model already
 * @param confidenceScores - the probability of the examples belonging to the labels returned.
 * @param exampleProbabilities - may be null. Otherwise its size() == examples.size() on return. It contains, for each example, in the same order,
 * a double array that contains the probability of belonging to the labels returned in predictedLabels.
 * @param columnFeatures - this Map supplies ColumnFeatures such as ColumnName, etc. May be null.
 * @return True, if successful, else False
 */
public boolean predictLabelForExamples(
        List<String> examples,
        int numPredictions,
        List<String> predictedLabels,
        List<Double> confidenceScores,
        List<double[]> exampleProbabilities,
        Map<ColumnFeature, Collection<String>> columnFeatures
        ) {
    ArrayList<ArrayList<Double>> exampleProbabilitiesFullList;
    MAPFieldOnly MAPPredictor;
    double[] columnProbabilities;
    ArrayList<String> labels;
    ArrayList<Double> columnProbabilitiesList;
    HashSet<String> features;
    if (file == null) {
        Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
        return false;
    }
    // Sanity checks for arguments
    if (examples == null || examples.size() == 0 || numPredictions <= 0 || predictedLabels == null || confidenceScores == null) {
        Prnt.prn("Invalid arguments. Possible problems: examples list size is zero, numPredictions is non-positive, predictedLabels or confidenceScores list is null.");
        return false;
    }
    // Making sure that there exists a model.
    if (globalData.labels.size() == 0) {
        Prnt.prn("The model does have not any semantic types. Please add some labels with their examples before attempting to predict using this model.");
        return false;
    }
    exampleProbabilitiesFullList = new ArrayList<ArrayList<Double>>();
    MAPPredictor = new MAPFieldOnly(globalData);
    columnProbabilities = new double[globalData.labels.size()];
    features = new HashSet<String>();
    // for each example, get the probability of each label and add it to the accumulator array;
    // the label with the highest accumulated probability is the most likely label for all examples combined
    for (String example : examples) {
        GraphFieldOnly exampleGraph;
        String sanitizedExample;
        double[] probabilitiesForExample;
        // a null entry is treated like an example without any allowed character,
        // so that the per-example results stay aligned with the input list
        sanitizedExample = (example == null) ? "" : getSanitizedString(example);
        if (sanitizedExample.length() == 0) {
            sanitizedExample = ".";
        }
        featureSet(sanitizedExample, columnFeatures, features);
        exampleGraph = new GraphFieldOnly(sanitizedExample, null, new ArrayList<String>(features), globalData);
        probabilitiesForExample = MAPPredictor.probabilitiesForLabels(exampleGraph);
        Matrix.plusEquals(columnProbabilities, probabilitiesForExample, 1.0);
        if (exampleProbabilities != null) {
            exampleProbabilitiesFullList.add(newListFromDoubleArray(probabilitiesForExample));
        }
    }
    // divide the accumulated values by the number of examples to obtain averages
    for (int i = 0; i < globalData.labels.size(); i++) {
        columnProbabilities[i] /= examples.size();
    }
    // Sort both lists together so that the label at index i has the probability at index i;
    // ListOps.sortListOnValues is expected to order them by descending probability
    labels = new ArrayList<String>(globalData.labels);
    columnProbabilitiesList = newListFromDoubleArray(columnProbabilities);
    ListOps.sortListOnValues(labels, columnProbabilitiesList);
    // Preparing to return values now
    predictedLabels.clear();
    confidenceScores.clear();
    if (exampleProbabilities != null) {
        exampleProbabilities.clear();
        int minPreds = Math.min(numPredictions, globalData.labels.size());
        for (int i = 0; i < examples.size(); i++) {
            exampleProbabilities.add(new double[minPreds]);
        }
    }
    for (int index = 0; index < globalData.labels.size() && index < numPredictions; index++) {
        predictedLabels.add(labels.get(index));
        confidenceScores.add(columnProbabilitiesList.get(index));
        if (exampleProbabilities != null) {
            // position of the returned label in the model's own label order, which is
            // the order of the per-example probability lists
            int li = globalData.labels.indexOf(labels.get(index));
            for (int i = 0; i < examples.size(); i++) {
                exampleProbabilities.get(i)[index] = exampleProbabilitiesFullList.get(i).get(li);
            }
        }
    }
    return true;
}
/**
 * Reads a model file and sets up globalData, the CRF model and the list of examples of each label.
 * A file that contains no non-blank line is accepted and results in an empty model.
 * <p>
 * Layout expected for a non-empty file, as read by this method: the number of labels, one
 * line that is skipped, then for every label its name, its number of examples, the examples
 * themselves (see parseExample) and one line that is skipped. After the labels comes the
 * number of feature functions followed by one line per feature function holding a label,
 * a feature and a weight separated by whitespace.
 * <p>
 * On any failure the handler is marked as not ready (the model file path is set to null).
 * The readers opened by this method are closed on every path, including failures.
 *
 * @param modelFile The path of the file from which the model should be read.
 * @return True if successfully read. False, otherwise.
 */
public boolean readModelFromFile(String modelFile) {
    BufferedReader br;
    boolean emptyFile;
    if (modelFile == null) {
        Prnt.prn("Invalid argument value. Argument @file is null.");
        file = null;
        return false;
    }
    // first pass: find out whether the file has any non-blank line at all
    br = null;
    emptyFile = true;
    try {
        String line;
        br = new BufferedReader(new FileReader(modelFile));
        while ((line = br.readLine()) != null) {
            if (line.trim().length() != 0) {
                emptyFile = false;
                break;
            }
        }
    }
    catch (Exception e) {
        Prnt.prn("Error reading model file " + modelFile + ".");
        file = null;
        return false;
    }
    finally {
        closeReaderQuietly(br);
    }

    globalData = new GlobalDataFieldOnly();
    labelToExamplesMap = new HashMap<String, ArrayList<Example>>();
    if (emptyFile) {
        // an empty file gives a model without labels, graphs or feature functions
        globalData.trainingGraphs = new ArrayList<GraphInterface>();
        CRFModelFieldOnly emptyModel = new CRFModelFieldOnly(globalData);
        emptyModel.ffs = new ArrayList<LblFtrPair>();
        emptyModel.weights = new double[0];
        globalData.crfModel = emptyModel;
        file = modelFile;
        return true;
    }

    // second pass: parse labels, examples and feature functions
    br = null;
    try {
        HashSet<String> features = new HashSet<String>();
        ArrayList<Example> selectedExamples;
        int numLabels;
        int numFFs;
        ArrayList<LblFtrPair> ffs;
        double[] weights;
        CRFModelFieldOnly crfModel;

        br = new BufferedReader(new FileReader(modelFile));
        // Read the number of labels in the model file
        numLabels = Integer.parseInt(br.readLine().trim());
        br.readLine();
        // read numLabels labels and their examples
        for (int labelNumber = 0; labelNumber < numLabels; labelNumber++) {
            String newLabel = br.readLine().trim();
            if (globalData.labels.contains(newLabel)) {
                Prnt.prn("The label " + newLabel + " was already added to the model. " +
                        "Later in the file, we found another list that had the same label and a set of examples underneath it. This is an error. " +
                        "A label can only occur one in the file. All its examples have to be listed underneath it at one place.");
                file = null;
                return false;
            }
            globalData.labels.add(newLabel);
            ArrayList<Example> examples = new ArrayList<Example>();
            int numExamples = Integer.parseInt(br.readLine().trim());
            for (int egNumber = 0; egNumber < numExamples; egNumber++) {
                Example example = parseExample(br);
                if (example == null) {
                    Prnt.prn("Parsing of file failed. Could not parse an example.");
                    file = null;
                    return false;
                }
                examples.add(example);
            }
            labelToExamplesMap.put(newLabel, examples);
            br.readLine(); // consuming the empty line after each list of label and its examples
        }
        // Creating trainingGraphs for at most MAX_EXAMPLES_PER_LABEL examples of each label
        globalData.trainingGraphs = new ArrayList<GraphInterface>();
        selectedExamples = new ArrayList<Example>();
        for (String lbl : globalData.labels) {
            ArrayList<Example> allExamples = labelToExamplesMap.get(lbl);
            selectedExamples.clear();
            if (allExamples.size() <= MAX_EXAMPLES_PER_LABEL) {
                selectedExamples.addAll(allExamples);
            }
            else {
                RandOps.getRandomlySelectedItemsFromList(allExamples, selectedExamples, MAX_EXAMPLES_PER_LABEL);
            }
            for (Example example : selectedExamples) {
                featureSet(example, features);
                globalData.trainingGraphs.add(new GraphFieldOnly(example.exampleString, lbl, new ArrayList<String>(features), globalData));
            }
        }
        // reading the feature functions and their weights; the first line is the number of such ffs
        numFFs = Integer.parseInt(br.readLine().trim());
        ffs = new ArrayList<LblFtrPair>();
        weights = new double[numFFs];
        for (int ffNumber = 0; ffNumber < numFFs; ffNumber++) {
            String ffLine = br.readLine().trim();
            if (ffLine.length() == 0) {
                Prnt.prn("While reading " + numFFs + " feature functions, we encountered an empty line. This is an error. " +
                        "All feature functions have to be listed continuously without any blank lines in between.");
                file = null;
                return false;
            }
            String[] lineParts = ffLine.split("\\s+");
            ffs.add(new LblFtrPair(globalData.labels.indexOf(lineParts[0]), lineParts[1]));
            weights[ffNumber] = Double.parseDouble(lineParts[2]);
        }
        crfModel = new CRFModelFieldOnly(globalData);
        crfModel.ffs = ffs;
        crfModel.weights = weights;
        globalData.crfModel = crfModel;
        file = modelFile;
        return true;
    }
    catch (Exception e) {
        // covers I/O errors as well as malformed content (missing lines, bad numbers, short ff lines)
        Prnt.prn("Error parsing model file " + modelFile + ".");
        file = null;
        return false;
    }
    finally {
        // runs for the early returns above as well, so the reader is never left open
        closeReaderQuietly(br);
    }
}

/**
 * Closes the reader if it is not null and ignores a failure to close it,
 * since the outcome of reading has already been decided by then.
 *
 * @param reader The reader to close, may be null
 */
private static void closeReaderQuietly(BufferedReader reader) {
    if (reader == null) {
        return;
    }
    try {
        reader.close();
    }
    catch (java.io.IOException ignored) {
        // nothing useful can be done about a failed close of a reader
    }
}
/**
 * Removes all labels from the CRF model.
 * The model file is emptied first; afterwards the in-memory state is replaced by a fresh
 * model without labels, training graphs or feature functions, which is the same state as
 * after reading an empty file. If the file cannot be emptied, the handler is marked as
 * not ready and the in-memory state is left untouched.
 *
 * @return True if successfully cleared the model. False, otherwise.
 */
public boolean removeAllLabels() {
    if (file == null) {
        Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
        return false;
    }
    try {
        BufferedWriter writer = new BufferedWriter(new FileWriter(file));
        writer.write("");
        writer.close();
    }
    catch (Exception e) {
        Prnt.prn("Clearing the contents of the model file failed.");
        file = null;
        return false;
    }
    labelToExamplesMap = new HashMap<String, ArrayList<Example>>();
    globalData = new GlobalDataFieldOnly();
    globalData.trainingGraphs = new ArrayList<GraphInterface>();
    CRFModelFieldOnly freshModel = new CRFModelFieldOnly(globalData);
    freshModel.ffs = new ArrayList<LblFtrPair>();
    freshModel.weights = new double[0];
    globalData.crfModel = freshModel;
    return true;
}
/**
 * Removes a label from the model together with its examples, its training graphs and
 * its feature functions. Label indices of the remaining training graphs and feature
 * functions are shifted down where necessary, the model is re-optimized and saved.
 * If saving fails, the handler is marked as not ready.
 *
 * @param label The label to remove; it is trimmed before lookup.
 * @return True if the label was removed and the model was saved. False, otherwise.
 */
public boolean removeLabel(String label) {
    if (file == null) {
        Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
        return false;
    }
    if (label == null) {
        Prnt.prn("Illegal value, null, passed for argument @label");
        return false;
    }
    label = label.trim();
    final int removedIndex = globalData.labels.indexOf(label);
    if (removedIndex == -1) {
        Prnt.prn("Label " + label + " does not exist in the CRF model.");
        return false;
    }
    globalData.labels.remove(removedIndex);
    labelToExamplesMap.remove(label);

    // drop the graphs of the removed label; graphs of later labels move one index down
    int graphPos = 0;
    while (graphPos < globalData.trainingGraphs.size()) {
        GraphFieldOnly graph = (GraphFieldOnly) globalData.trainingGraphs.get(graphPos);
        if (graph.node.labelIndex == removedIndex) {
            // the next graph slides into this position, so the position is not advanced
            globalData.trainingGraphs.remove(graphPos);
            continue;
        }
        if (graph.node.labelIndex > removedIndex) {
            graph.node.labelIndex--;
        }
        graphPos++;
    }

    // keep the feature functions (and their weights) that belong to other labels
    ArrayList<LblFtrPair> keptFFs = new ArrayList<LblFtrPair>();
    ArrayList<Double> keptWeights = new ArrayList<Double>();
    int ffCount = globalData.crfModel.ffs.size();
    for (int ffPos = 0; ffPos < ffCount; ffPos++) {
        LblFtrPair ff = globalData.crfModel.ffs.get(ffPos);
        if (ff.labelIndex == removedIndex) {
            continue;
        }
        keptFFs.add(ff);
        keptWeights.add(globalData.crfModel.weights[ffPos]);
    }
    // labels that came after the removed one now have an index that is smaller by one,
    // so the kept feature functions referring to them are adjusted
    for (LblFtrPair keptFF : keptFFs) {
        if (keptFF.labelIndex > removedIndex) {
            keptFF.labelIndex--;
        }
    }
    double[] newWeights = new double[keptWeights.size()];
    for (int w = 0; w < newWeights.length; w++) {
        newWeights[w] = keptWeights.get(w);
    }
    globalData.crfModel.ffs = keptFFs;
    globalData.crfModel.weights = newWeights;

    OptimizeFieldOnly optimizer = new OptimizeFieldOnly(globalData.crfModel, globalData);
    optimizer.optimize(10);

    boolean saved = saveModel();
    if (!saved) {
        file = null;
    }
    return saved;
}
/**
 * Builds the list of characters that are allowed in examples, each as a one-character string.
 * The order is A-Z, a-z, 0-9, followed by the space and the punctuation characters
 * in the order in which they are listed below.
 *
 * @return Returns list of allowed Characters
 */
private ArrayList<String> allowedCharacters() {
    // space and dot first, then the remaining punctuation
    final String punctuation = " .%@_-*()[]+/&:,;?";
    ArrayList<String> allowed = new ArrayList<String>();
    // String.valueOf is used because the Character(char) constructor is deprecated
    for (char c = 'A'; c <= 'Z'; c++) {
        allowed.add(String.valueOf(c));
    }
    for (char c = 'a'; c <= 'z'; c++) {
        allowed.add(String.valueOf(c));
    }
    for (char c = '0'; c <= '9'; c++) {
        allowed.add(String.valueOf(c));
    }
    for (int i = 0; i < punctuation.length(); i++) {
        allowed.add(String.valueOf(punctuation.charAt(i)));
    }
    return allowed;
}
/**
 * Cleans a list of examples, generally one passed in from outside.
 * Null entries are dropped, every other entry is reduced to its allowed characters,
 * and entries that end up empty are dropped as well.
 *
 * @param uncleanList List of all examples
 * @param cleanedList List that is cleared and then filled with the sanitized, non-empty examples
 */
private void cleanedExamplesList(List<String> uncleanList, List<String> cleanedList) {
    cleanedList.clear();
    for (String candidate : uncleanList) {
        if (candidate == null) {
            continue;
        }
        String sanitized = getSanitizedString(candidate);
        if (sanitized.length() == 0) {
            continue;
        }
        cleanedList.add(sanitized);
    }
}
/**
 * Extracts features from a column name: the name is split into parts with splitString,
 * and every distinct part that is not blank is added in trimmed, lower-case form.
 *
 * @param columnName The value passed for the ColumnFeature ColumnHeaderName
 * @param features The set that is cleared and then used to return the extracted features.
 */
private void extractFeaturesFromColumnName(String columnName, HashSet<String> features) {
    ArrayList<String> pieces = new ArrayList<String>();
    HashSet<String> distinctPieces = new HashSet<String>();
    features.clear();
    splitString(columnName, pieces);
    distinctPieces.addAll(pieces);
    for (String piece : distinctPieces) {
        String trimmed = piece.trim();
        if (trimmed.length() == 0) {
            continue;
        }
        features.add(trimmed.toLowerCase());
    }
}
/**
 * Extracts features from a table name: the name is split into parts with splitString,
 * and every distinct part that is not blank is added in trimmed, lower-case form.
 *
 * @param tableName The value passed for the ColumnFeature TableName
 * @param features The set that is cleared and then used to return the extracted features.
 */
private void extractFeaturesFromTableName(String tableName, HashSet<String> features) {
    ArrayList<String> pieces = new ArrayList<String>();
    HashSet<String> distinctPieces = new HashSet<String>();
    features.clear();
    splitString(tableName, pieces);
    distinctPieces.addAll(pieces);
    for (String piece : distinctPieces) {
        String trimmed = piece.trim();
        if (trimmed.length() == 0) {
            continue;
        }
        features.add(trimmed.toLowerCase());
    }
}
/**
 * Collects the features of a field: the field is tokenized with Lexer.tokenizeField and
 * the features that RegexFeatureExtractor reports for each token are gathered.
 *
 * @param field A string from which the features will be extracted
 * @param features The set that is cleared and then used to return those features.
 */
private void featureSet(String field, HashSet<String> features) {
    ArrayList<Part> tokens = Lexer.tokenizeField(field);
    features.clear();
    for (int tokenPos = 0; tokenPos < tokens.size(); tokenPos++) {
        features.addAll(RegexFeatureExtractor.getTokenFeatures(tokens.get(tokenPos)));
    }
}
/**
 * Collects all features of an example: the features of the example string itself,
 * plus the features of its column header name and of its table name when the
 * example has values for those column features.
 *
 * @param example The example for which the features have to be extracted
 * @param features The set that is cleared and then used to return those features.
 */
private void featureSet(Example example, HashSet<String> features) {
    // scratch set that each extraction step overwrites
    HashSet<String> scratch = new HashSet<String>();
    features.clear();

    // features of the example string itself
    featureSet(example.exampleString, scratch);
    features.addAll(scratch);

    // features of the column header name, if the example has one
    String columnName = example.getValueForColumnFeature(ColumnFeature.ColumnHeaderName);
    if (columnName != null) {
        extractFeaturesFromColumnName(columnName, scratch);
        features.addAll(scratch);
    }

    // features of the table name, if the example has one
    String tableName = example.getValueForColumnFeature(ColumnFeature.TableName);
    if (tableName != null) {
        extractFeaturesFromTableName(tableName, scratch);
        features.addAll(scratch);
    }
}
/**
 * Collects the features of a field that comes with column features.
 * An Example is built from the field and, for every column feature, the first string of
 * its collection; the features of that example are then returned through
 * featureSet(Example, HashSet<String>).
 *
 * @param field Field for which features are to be extracted
 * @param columnFeatures The columnFeatures of the field, may be null.
 * @param features A set used to return the features.
 */
private void featureSet(String field, Map<ColumnFeature, Collection<String>> columnFeatures, HashSet<String> features) {
    Example example = new Example(field);
    if (columnFeatures != null) {
        for (Map.Entry<ColumnFeature, Collection<String>> entry : columnFeatures.entrySet()) {
            Collection<String> values = entry.getValue();
            if (values == null || values.isEmpty()) {
                continue;
            }
            // only the first value of the collection is used
            example.addColumnFeature(entry.getKey(), values.iterator().next());
        }
    }
    featureSet(example, features);
}
/**
 * Removes every character that is not in the list of allowed characters.
 * The remaining characters keep their order.
 *
 * @param unsanitizedString The string to sanitize; null is treated like an empty string.
 * @return The string reduced to its allowed characters, possibly empty, never null.
 */
private String getSanitizedString(String unsanitizedString) {
    if (unsanitizedString == null) {
        return "";
    }
    // a builder avoids creating a new string for every kept character
    StringBuilder sanitized = new StringBuilder(unsanitizedString.length());
    for (int i = 0; i < unsanitizedString.length(); i++) {
        char current = unsanitizedString.charAt(i);
        // allowedCharacters holds one-character strings
        if (allowedCharacters.contains(String.valueOf(current))) {
            sanitized.append(current);
        }
    }
    return sanitized.toString();
}
/**
 * Utility method that copies an array of doubles into a new list.
 *
 * @param array The array of doubles
 * @return A new list containing the same doubles in the same order
 */
private ArrayList<Double> newListFromDoubleArray(double[] array) {
    ArrayList<Double> copy = new ArrayList<Double>();
    for (int position = 0; position < array.length; position++) {
        copy.add(Double.valueOf(array[position]));
    }
    return copy;
}
/**
 * Parses one example starting at the current position of the reader.
 * An example is a length header followed by that many characters of example text.
 * It may be followed by column features, each introduced by a space and consisting of a
 * length header and that many characters of the form name:value, where name is the name
 * of a ColumnFeature. The example ends with a newline character.
 * <p>
 * Reaching the end of the file before the example is complete is reported as a parsing
 * failure, as is a column feature entry that has no colon.
 *
 * @param br A BufferedReader instance
 * @return Parsed Example instance, or null if the example could not be parsed.
 * @throws Exception Mainly IOException
 */
private Example parseExample(BufferedReader br) throws Exception {
    int contentLen = parseLengthHeader(br);
    if (contentLen == -1) {
        Prnt.prn("Parsing of file failed since lengthHeader could not be parsed.");
        return null;
    }
    // the space after the length header has already been consumed
    String exampleString = readFixedLengthContent(br, contentLen);
    if (exampleString == null) {
        Prnt.prn("Parsing of file failed because the file ended in the middle of an example.");
        return null;
    }
    Example example = new Example(exampleString);
    while (true) {
        // kept as int so that the end of the file (-1) is not mistaken for a character
        int next = br.read();
        if (next == '\n') {
            break;
        }
        if (next != ' ') {
            Prnt.prn("Parsing of file failed because found a character other than space or newline after a column feature. The charcter is " + next);
            return null;
        }
        contentLen = parseLengthHeader(br);
        if (contentLen == -1) {
            Prnt.prn("Parsing of file failed since lengthHeader could not be parsed.");
            return null;
        }
        String columnFeatureStringAndValue = readFixedLengthContent(br, contentLen);
        if (columnFeatureStringAndValue == null) {
            Prnt.prn("Parsing of file failed because the file ended in the middle of a column feature.");
            return null;
        }
        // everything before the first colon is the feature name, everything after it the value
        int colonIndex = columnFeatureStringAndValue.indexOf(':');
        if (colonIndex == -1) {
            Prnt.prn("Parsing of file failed. The column feature entry " + columnFeatureStringAndValue + " does not contain a colon.");
            return null;
        }
        String columnFeatureString = columnFeatureStringAndValue.substring(0, colonIndex);
        String columnFeatureValue = columnFeatureStringAndValue.substring(colonIndex + 1);
        ColumnFeature columnFeature;
        try {
            columnFeature = ColumnFeature.valueOf(columnFeatureString);
        }
        catch (IllegalArgumentException e) {
            Prnt.prn("Parsing of file failed. There is no ColumnFeature called " + columnFeatureString + ".");
            return null;
        }
        example.addColumnFeature(columnFeature, columnFeatureValue);
    }
    return example;
}

/**
 * Reads exactly the requested number of characters from the reader.
 *
 * @param br The reader to read from
 * @param count Number of characters to read
 * @return The characters read, or null if the end of the file was reached first
 * @throws java.io.IOException If reading fails
 */
private static String readFixedLengthContent(BufferedReader br, int count) throws java.io.IOException {
    StringBuilder content = new StringBuilder(count);
    for (int i = 0; i < count; i++) {
        int next = br.read();
        if (next == -1) {
            return null;
        }
        content.append((char) next);
    }
    return content.toString();
}
/**
 * Reads a length header: one to five decimal digits that are terminated by a space.
 * The terminating space is consumed.
 *
 * @param br BufferedReader reading the model file.
 * @return The int value of the digits, or -1 if there were no digits, more than five
 * digits, or a character other than a digit or a space was found.
 * @throws Exception If reading from the reader fails
 */
private int parseLengthHeader(BufferedReader br) throws Exception {
    int value = 0;
    int digitsSeen = 0;
    for (;;) {
        char current = (char) br.read();
        boolean isDigit = current >= '0' && current <= '9';
        if (!isDigit) {
            // only a space that follows at least one digit ends the header successfully
            return (current == ' ' && digitsSeen > 0) ? value : -1;
        }
        digitsSeen++;
        if (digitsSeen > 5) {
            Prnt.prn("Length marker has more than 5 digits. The program doesn't expect such large entries. Signaling parsing error.");
            return -1;
        }
        value = value * 10 + (current - '0');
    }
}
/**
 * Removes from globalData.trainingGraphs every graph whose node carries the given labelIndex.
 * @param labelIndex The label index whose training graphs are to be dropped.
 */
private void removeGraphsForLabel(int labelIndex) {
    int pos = 0;
    while (pos < globalData.trainingGraphs.size()) {
        GraphFieldOnly candidate = (GraphFieldOnly) globalData.trainingGraphs.get(pos);
        if (candidate.node.labelIndex == labelIndex) {
            // The following element slides into this slot, so the position is not advanced.
            globalData.trainingGraphs.remove(pos);
        }
        else {
            pos++;
        }
    }
}
/**
 * Replaces the feature functions of one label with feature functions for the supplied features.
 * Feature functions of all other labels are kept, with their weights, in their existing order.
 * For the given label, an existing feature function whose feature is in newFeatureSet is kept
 * with its learned weight; every remaining feature in newFeatureSet gets a new feature function
 * with weight zero; all other feature functions of the label are dropped.
 * The result replaces globalData.crfModel.ffs and globalData.crfModel.weights.
 * @param labelIndex The labelIndex for which the feature functions will be reselected.
 * @param newFeatureSet The new set of features to be included. This set is only read, never modified.
 */
private void reselectFFs(int labelIndex, Set<String> newFeatureSet) {
    ArrayList<LblFtrPair> ffsOfLabel = new ArrayList<LblFtrPair>();
    ArrayList<LblFtrPair> otherFFs = new ArrayList<LblFtrPair>();
    ArrayList<Double> weightsOfFFsOfLabel = new ArrayList<Double>();
    ArrayList<Double> weightsOfOtherFFs = new ArrayList<Double>();
    // separate the label ffs and weights from other ffs and weights
    for (int ffIndex = 0; ffIndex < globalData.crfModel.ffs.size(); ffIndex++) {
        LblFtrPair ff = globalData.crfModel.ffs.get(ffIndex);
        if (ff.labelIndex == labelIndex) {
            ffsOfLabel.add(ff);
            weightsOfFFsOfLabel.add(globalData.crfModel.weights[ffIndex]);
        }
        else {
            otherFFs.add(ff);
            weightsOfOtherFFs.add(globalData.crfModel.weights[ffIndex]);
        }
    }
    // Keep the existing ffs of this label whose feature is still selected, together with their learned weight.
    // The reused features are tracked locally so that the caller's set is left untouched; add() returning
    // false also guarantees that a feature is kept at most once.
    Set<String> reusedFeatures = new HashSet<String>();
    for (int ffIndex = 0; ffIndex < ffsOfLabel.size(); ffIndex++) {
        LblFtrPair ff = ffsOfLabel.get(ffIndex);
        if (newFeatureSet.contains(ff.feature) && reusedFeatures.add(ff.feature)) {
            otherFFs.add(ff);
            weightsOfOtherFFs.add(weightsOfFFsOfLabel.get(ffIndex));
        }
    }
    // create new ffs for all selected features that had no ff yet, and add zero as their weight
    for (String ftr : newFeatureSet) {
        if (!reusedFeatures.contains(ftr)) {
            otherFFs.add(new LblFtrPair(labelIndex, ftr));
            weightsOfOtherFFs.add(0.0);
        }
    }
    // reset the ffs and the weights array
    globalData.crfModel.ffs = otherFFs;
    globalData.crfModel.weights = new double[otherFFs.size()];
    for (int i = 0; i < otherFFs.size(); i++) {
        globalData.crfModel.weights[i] = weightsOfOtherFFs.get(i);
    }
}
/**
 * Randomly selects up to MAX_FFs_PER_LABEL distinct features.
 * Each draw is uniform over the entries still in the pool, so a feature that occurs more often in
 * allFeatures is proportionally more likely to be drawn. Once drawn, all occurrences of that feature
 * are removed from the pool. If the pool runs out of features before MAX_FFs_PER_LABEL have been
 * drawn, the selection simply ends with fewer features.
 * @param allFeatures The pool of candidate features; may contain duplicates. It is not modified.
 * @param selectedFeatureSet Output argument. It is cleared and then filled with the selected features.
 */
private void selectFeatureSetWithWeightedProbability(List<String> allFeatures, Set<String> selectedFeatureSet) {
    ArrayList<String> pool = new ArrayList<String>(allFeatures);
    Random random = new Random();
    selectedFeatureSet.clear();
    // Stop early on an exhausted pool: Random.nextInt(0) would throw IllegalArgumentException.
    for (int i = 0; i < MAX_FFs_PER_LABEL && !pool.isEmpty(); i++) {
        String ftr = pool.get(random.nextInt(pool.size()));
        selectedFeatureSet.add(ftr);
        // drop every occurrence of the drawn feature so that it cannot be drawn again
        pool.removeAll(Collections.singleton(ftr));
    }
}
/**
 * This method writes the model in memory to the file that it was read from.
 * The file layout is: the number of labels, a blank line, then for every label its name,
 * the number of saved examples and one line per example, followed by a blank line; finally the
 * number of feature functions and one "label feature weight" line per feature function.
 * At most MAX_EXAMPLES_SAVED_PER_LABEL randomly chosen examples are written per label; the
 * example lists held in memory are neither reordered nor truncated by saving.
 * @return true, if writing is successful, else return, false
 */
private boolean saveModel() {
    BufferedWriter bw = null;
    try {
        bw = new BufferedWriter(new FileWriter(file));
        // Write the number of labels and then a blank line
        bw.write(globalData.labels.size() + "\n");
        bw.write("\n");
        // Write name of label and then list its examples.
        for (String label : globalData.labels) {
            bw.write(label + "\n");
            ArrayList<Example> examples = labelToExamplesMap.get(label);
            // Get random MAX_EXAMPLES_SAVED_PER_LABEL number of examples to be saved.
            // A copy is shuffled so that the in-memory list keeps its order.
            if (examples.size() > MAX_EXAMPLES_SAVED_PER_LABEL) {
                ArrayList<Example> shuffled = new ArrayList<Example>(examples);
                Collections.shuffle(shuffled);
                examples = new ArrayList<Example>(shuffled.subList(0, MAX_EXAMPLES_SAVED_PER_LABEL));
            }
            bw.write(examples.size() + "\n");
            for (Example example : examples) {
                // every field is prefixed with its length so that it can contain spaces
                bw.write(example.exampleString.length() + " " + example.exampleString);
                for (Map.Entry<ColumnFeature, String> entry : example.columnFeatures.entrySet()) {
                    if (entry.getValue() != null) {
                        String featureValue = entry.getKey().toString() + ":" + entry.getValue();
                        bw.write(" " + featureValue.length() + " " + featureValue);
                    }
                }
                bw.write("\n");
            }
            bw.write("\n");
        }
        // write all the feature functions
        bw.write(globalData.crfModel.ffs.size() + "\n");
        for (int ffIndex = 0; ffIndex < globalData.crfModel.ffs.size(); ffIndex++) {
            LblFtrPair ff = globalData.crfModel.ffs.get(ffIndex);
            bw.write(globalData.labels.get(ff.labelIndex) + " " + ff.feature + " " + globalData.crfModel.weights[ffIndex] + "\n");
        }
        // Close inside the try block so that a failure of the final flush is reported as a failed save.
        bw.close();
        return true;
    }
    catch (Exception e) {
        Prnt.prn("Writing the model to file " + file + " failed. The file can be inconsistent with the model in memory until it is successfully written.");
        Prnt.prn("Cause: " + e);
        return false;
    }
    finally {
        // Release the file handle on the failure path as well; closing an already closed writer has no effect.
        if (bw != null) {
            try {
                bw.close();
            }
            catch (Exception ignored) {
                // the outcome of the save has already been decided above
            }
        }
    }
}
/**
 * Breaks a string into its non-empty tokens, using runs of whitespace and underscores as separators.
 * @param str The string to be split
 * @param parts The list in which the tokens are returned, in their order of appearance. Any previous content is discarded.
 * @return True, if successful. False, if either argument is null.
 */
private boolean splitString(String str, ArrayList<String> parts) {
    if (str == null || parts == null) {
        return false;
    }
    parts.clear();
    // first cut on whitespace, then cut every chunk on underscores
    for (String chunk : str.split("\\s+")) {
        for (String piece : chunk.split("_")) {
            if (piece.length() > 0) {
                parts.add(piece);
            }
        }
    }
    return true;
}
} // end of class CRFModelHandlerNew
/*
public static boolean getWeightedFeatureFunctionSums(String example, Map<ColumnFeature, Collection<String>> columnFeatures, List<Double> sums) {
GraphFieldOnly exampleGraph ;
HashSet<String> features;
double[] ffSums;
MAPFieldOnly mapPredictor;
features = new HashSet<String>();
featureSet(example, columnFeatures, features);
exampleGraph = new GraphFieldOnly(example, null, new ArrayList<String>(features), globalData) ;
mapPredictor = new MAPFieldOnly(globalData);
ffSums = mapPredictor.weightedFeatureFunctionSums(exampleGraph);
sums.clear();
for(double sum : ffSums) {
sums.add(sum);
}
return true;
}
private static boolean addOrUpdateLabel(String label, List<String> examples) {
if (file == null) {
Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
return false ;
}
else {
return addOrUpdateLabel(label, examples, null) ;
}
}
* @param examples - list of examples of an unknown type
* @param numPredictions - required number of predictions in descending order
* @param predictedLabels - the argument in which the ordered list of labels is returned. the size of this list could be smaller than numPredictions
* if there aren't that many labels in the model already
* @param confidenceScores - the probability of the examples belonging to the labels returned.
* @return
private static boolean predictLabelForExamples(
List<String> examples,
int numPredictions,
List<String> predictedLabels,
List<Double> confidenceScores
) {
if (CRFModelHandler.file == null) {
Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
return false ;
}
else {
return predictLabelForExamples(examples, numPredictions, predictedLabels, confidenceScores, null) ;
}
}
* @param examples - list of examples of an unknown type
* @param numPredictions - required number of predictions in descending order
* @param predictedLabels - the argument in which the ordered list of labels is returned. the size of this list could be smaller than numPredictions
* if there aren't that many labels in the model already
* @param confidenceScores - the probability of the examples belonging to the labels returned.
* @param exampleProbabilities - the size() == examples.size(). It contains, for each example, in the same order, a double array that contains the probability
* of belonging to the labels returned in predictedLabels.
* @return
private static boolean predictLabelForExamples(
List<String> examples,
int numPredictions,
List<String> predictedLabels,
List<Double> confidenceScores,
List<double[]> exampleProbabilities
) {
if (CRFModelHandler.file == null) {
Prnt.prn("CRF Model is not ready, either because it was never read or an error happened while reading it previously. Please try reading the model file again.");
return false ;
}
else {
return predictLabelForExamples(examples, numPredictions, predictedLabels, confidenceScores, exampleProbabilities, null) ;
}
}
*/