package;
import;
import;
import;
import java.util.ArrayList;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.maltparser.core.exception.MaltChainedException;
import org.maltparser.core.feature.FeatureException;
import org.maltparser.core.feature.FeatureVector;
import org.maltparser.core.feature.function.FeatureFunction;
import org.maltparser.core.feature.function.Modifiable;
import org.maltparser.core.feature.value.SingleFeatureValue;
import org.maltparser.core.syntaxgraph.DependencyStructure;
import;
import;
import;
import org.maltparser.parser.history.action.SingleDecision;

/**
 * The feature divide model is used to divide the training instances into
 * several models according to a divide feature. Usually this strategy decreases
 * the training and classification time, but it can also decrease the accuracy
 * of the parser.
 *
 * @author Johan Hall
 * @since 1.0
 */
public class FeatureDivideModel implements InstanceModel {

    /** Extension of the settings file written by {@link #save()} and read by {@link #load()}. */
    private static final String SETTINGS_FILE_SUFFIX = ".dsm";

    /** Column separator of the settings file; compiled once instead of on every load. */
    private static final Pattern TAB_PATTERN = Pattern.compile("\t");

    private Model parent;
    /** Divide models keyed by the index code of the divide feature value. */
    private final SortedMap<Integer, AtomicModel> divideModels;
    private FeatureVector masterFeatureVector;
    private FeatureVector divideFeatureVector;
    private int frequency = 0;
    private FeatureFunction divideFeature;
    private int divideThreshold;
    /** Model with index -1; receives the instances of divide models at or below the threshold. */
    private AtomicModel masterModel;
    /** Positions of the divide feature in the feature vector given to the constructor. */
    private ArrayList<Integer> divideFeatureIndexVector;

    /**
     * Constructs a feature divide model.
     *
     * @param features the feature vector used by the atomic model.
     * @param parent the parent guide model.
     * @throws MaltChainedException
     */
    public FeatureDivideModel(FeatureVector features, Model parent) throws MaltChainedException {
        setParent(parent);
        setFrequency(0);
        initSplitParam(features);
        divideModels = new TreeMap<Integer, AtomicModel>();
        if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) {
            masterModel = new AtomicModel(-1, masterFeatureVector, this);
        } else if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.CLASSIFY) {
            load();
        }
    }

    /**
     * Adds a training instance to the divide model selected by the current
     * value of the divide feature; the divide model is created on first use.
     *
     * @param decision the decision to add as an instance.
     * @throws MaltChainedException if the guide is in classify mode, if the
     *         divide feature is not single valued or if the divide models are missing.
     */
    public void addInstance(SingleDecision decision) throws MaltChainedException {
        if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.CLASSIFY) {
            throw new GuideException("Can only add instance during learning. ");
        } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
            throw new GuideException("The divide feature does not have a single value. ");
        }

        divideFeature.update();
        if (divideModels == null) {
            throw new GuideException("The feature divide models cannot be found. ");
        }

        // Read the index code once, after the update, and reuse it for lookup and creation.
        final int indexCode = currentDivideIndexCode();
        AtomicModel divideModel = divideModels.get(indexCode);
        if (divideModel == null) {
            divideModel = new AtomicModel(indexCode, divideFeatureVector, this);
            divideModels.put(indexCode, divideModel);
        }
        divideModel.addInstance(decision);
    }

    /**
     * Signals that no more instances will be added. Every divide model whose
     * frequency is at or below the divide threshold has its instances moved to
     * the master model and is then removed.
     *
     * <p>No guide-mode check is performed here.
     *
     * @throws MaltChainedException if the divide models are missing.
     */
    public void noMoreInstances() throws MaltChainedException {
        if (divideModels == null) {
            throw new GuideException("The feature divide models cannot be found. ");
        }

        for (AtomicModel divideModel : divideModels.values()) {
            divideModel.noMoreInstances();
        }

        // Collect first and remove afterwards: the map must not be modified while iterating it.
        final TreeSet<Integer> removeSet = new TreeSet<Integer>();
        for (Integer index : divideModels.keySet()) {
            final AtomicModel divideModel = divideModels.get(index);
            if (divideModel.getFrequency() <= divideThreshold) {
                divideModel.moveAllInstances(masterModel, divideFeature, divideFeatureIndexVector);
                removeSet.add(index);
            }
        }
        for (Integer index : removeSet) {
            divideModels.remove(index);
        }
        masterModel.noMoreInstances();
    }

    /**
     * Forwards the end-of-sentence notification to every divide model.
     *
     * <p>No guide-mode check is performed here, and the master model is not notified.
     *
     * @param dependencyGraph the dependency structure of the finished sentence.
     * @throws MaltChainedException if the divide models are missing.
     */
    public void finalizeSentence(DependencyStructure dependencyGraph) throws MaltChainedException {
        if (divideModels == null) {
            throw new GuideException("The feature divide models cannot be found. ");
        }
        for (AtomicModel divideModel : divideModels.values()) {
            divideModel.finalizeSentence(dependencyGraph);
        }
    }

    /**
     * Predicts the next decision with the divide model that matches the current
     * divide feature value, falling back to the master model when it has a
     * frequency above zero, and otherwise to the default class code 1.
     *
     * <p>The divide feature is not updated by this method.
     *
     * @param decision the decision that receives the prediction.
     * @return the result of the selected model, or <code>true</code> when the
     *         default class code is used.
     * @throws MaltChainedException if the guide is in batch mode or the divide
     *         feature is not single valued.
     */
    public boolean predict(SingleDecision decision) throws MaltChainedException {
        checkPredictionPreconditions();

        final int indexCode = currentDivideIndexCode();
        final AtomicModel divideModel = (divideModels != null) ? divideModels.get(indexCode) : null;
        if (divideModel != null) {
            return divideModel.predict(decision);
        } else if (masterModel != null && masterModel.getFrequency() > 0) {
            return masterModel.predict(decision);
        }

        getGuide().getConfiguration().getConfigLogger().info("Could not predict the next parser decision because there is "
                + "no divide or master model that covers the divide value '" + indexCode + "', as default"
                + " class code '1' is used. ");
        decision.addDecision(1); // default prediction
        return true;
    }

    public FeatureVector predictExtract(SingleDecision decision) throws MaltChainedException {
        return getAtomicModel().predictExtract(decision);
    }

    public FeatureVector extract() throws MaltChainedException {
        return getAtomicModel().extract();
    }

    /**
     * Selects the atomic model for the current divide feature value: the
     * matching divide model if there is one, otherwise the master model when it
     * has a frequency above zero.
     *
     * @return the selected atomic model, never <code>null</code>.
     * @throws MaltChainedException if the guide is in batch mode, if the divide
     *         feature is not single valued or if no model covers the value.
     */
    private AtomicModel getAtomicModel() throws MaltChainedException {
        checkPredictionPreconditions();

        final int indexCode = currentDivideIndexCode();
        final AtomicModel divideModel = (divideModels != null) ? divideModels.get(indexCode) : null;
        if (divideModel != null) {
            return divideModel;
        } else if (masterModel != null && masterModel.getFrequency() > 0) {
            return masterModel;
        }

        // Both callers dereference the result immediately, so fail with a
        // descriptive exception instead of returning null.
        throw new GuideException("Could not find a divide or master model that covers the divide value '"
                + indexCode + "'. ");
    }

    /**
     * Verifies that prediction is allowed: the guide must not be in batch mode
     * and the divide feature must currently hold a single value.
     */
    private void checkPredictionPreconditions() throws MaltChainedException {
        if (getGuide().getGuideMode() == ClassifierGuide.GuideMode.BATCH) {
            throw new GuideException("Can only predict during parsing. ");
        } else if (!(divideFeature.getFeatureValue() instanceof SingleFeatureValue)) {
            throw new GuideException("The divide feature does not have a single value. ");
        }
    }

    /** Returns the index code of the divide feature's current (single) value. */
    private int currentDivideIndexCode() throws MaltChainedException {
        return ((SingleFeatureValue) divideFeature.getFeatureValue()).getIndexCode();
    }

    public void terminate() throws MaltChainedException {
        if (divideModels != null) {
            for (AtomicModel divideModel : divideModels.values()) {
                divideModel.terminate();
            }
        }
        if (masterModel != null) {
            masterModel.terminate();
        }
    }

    /**
     * Trains every divide model and the master model, saves the settings file
     * and finally terminates all models.
     *
     * @throws MaltChainedException
     */
    public void train() throws MaltChainedException {
        for (AtomicModel divideModel : divideModels.values()) {
            divideModel.train();
        }
        masterModel.train();
        save();
        for (AtomicModel divideModel : divideModels.values()) {
            divideModel.terminate();
        }
        masterModel.terminate();
    }

    /**
     * Initialize the feature split parameters and the split feature vector and
     * master feature vector according to the behavior strategy.
     *
     * @param featureVector the parent guide model's feature vector.
     * @throws MaltChainedException if a required option is missing, if the
     *         divide feature cannot be created or is not {@link Modifiable}, or
     *         if the threshold option is not an integer.
     */
    protected void initSplitParam(FeatureVector featureVector) throws MaltChainedException {
        final Object columnOption = getGuide().getConfiguration().getOptionValue("guide", "data_split_column");
        if (columnOption == null || columnOption.toString().length() == 0) {
            throw new GuideException("The option '--guide-data_split_column' cannot be found, when initializing the data split. ");
        }
        final Object structureOption = getGuide().getConfiguration().getOptionValue("guide", "data_split_structure");
        if (structureOption == null || structureOption.toString().length() == 0) {
            throw new GuideException("The option '--guide-data_split_structure' cannot be found, when initializing the data split. ");
        }

        try {
            final String spec = "InputColumn(" + columnOption.toString().trim() + ", "
                    + structureOption.toString().trim() + ")";
            divideFeature = featureVector.getFeatureModel().identifyFeature(spec);
        } catch (FeatureException e) {
            throw new GuideException("The data split feature 'InputColumn(" + columnOption.toString() + ", "
                    + structureOption.toString() + ") cannot be initialized. ", e);
        }
        if (!(divideFeature instanceof Modifiable)) {
            throw new GuideException("The data split feature 'InputColumn(" + columnOption.toString() + ", "
                    + structureOption.toString() + ") does not implement Modifiable interface. ");
        }

        divideFeatureIndexVector = new ArrayList<Integer>();
        for (int i = 0; i < featureVector.size(); i++) {
            if (featureVector.get(i).equals(divideFeature)) {
                divideFeatureIndexVector.add(i);
            }
        }

        // The master model keeps the complete feature vector, while the divide
        // models use a copy without the divide feature. (The MaltParser 0.4
        // variant, which moved the divide feature to the end of the master
        // feature vector, is not applied here.)
        masterFeatureVector = featureVector;
        divideFeatureVector = (FeatureVector) featureVector.clone();
        for (Integer i : divideFeatureIndexVector) {
            // Look the element up in the untouched source vector: the recorded
            // indices refer to it and are no longer valid in the copy once an
            // earlier element has been removed.
            divideFeatureVector.remove(featureVector.get(i));
        }

        try {
            // Test the option value itself; calling toString() on a missing value would fail.
            final Object thresholdOption = getGuide().getConfiguration().getOptionValue("guide", "data_split_threshold");
            if (thresholdOption != null) {
                divideThreshold = Integer.parseInt(thresholdOption.toString());
            } else {
                divideThreshold = 0;
            }
        } catch (NumberFormatException e) {
            throw new GuideException("The --guide-data_split_threshold option is not an integer value. ", e);
        }
    }

    /**
     * Saves the feature divide model settings (.dsm file): one line per model
     * holding the model index and its frequency separated by a tab, with the
     * master model first.
     *
     * @throws MaltChainedException if the file cannot be written.
     */
    protected void save() throws MaltChainedException {
        final String fileName = getModelName() + SETTINGS_FILE_SUFFIX;
        try {
            final BufferedWriter out = new BufferedWriter(
                    getGuide().getConfiguration().getConfigurationDir().getOutputStreamWriter(fileName));
            try {
                out.write(masterModel.getIndex() + "\t" + masterModel.getFrequency() + "\n");
                if (divideModels != null) {
                    for (AtomicModel divideModel : divideModels.values()) {
                        out.write(divideModel.getIndex() + "\t" + divideModel.getFrequency() + "\n");
                    }
                }
            } finally {
                // Always release the writer, also when a write fails.
                out.close();
            }
        } catch (IOException e) {
            throw new GuideException("Could not write to the guide model settings file '" + fileName + "', when "
                    + "saving the guide model settings to file. ", e);
        }
    }

    /**
     * Loads the feature divide model settings (.dsm file). A line with index -1
     * creates the master model and every other line creates a divide model; the
     * frequency of this model becomes the sum of all loaded frequencies.
     *
     * @throws MaltChainedException if the file cannot be read or is malformed.
     */
    protected void load() throws MaltChainedException {
        final String fileName = getModelName() + SETTINGS_FILE_SUFFIX;
        try {
            final BufferedReader in = new BufferedReader(
                    getGuide().getConfiguration().getConfigurationDir().getInputStreamReaderFromConfigFile(fileName));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    final String[] cols = TAB_PATTERN.split(line);
                    if (cols.length != 2) {
                        throw new GuideException("The guide model settings file '" + fileName
                                + "' contains a line that does not have exactly two tab-separated columns: '"
                                + line + "'. ");
                    }
                    final int code;
                    final int freq;
                    try {
                        code = Integer.parseInt(cols[0]);
                        freq = Integer.parseInt(cols[1]);
                    } catch (NumberFormatException e) {
                        throw new GuideException("Could not convert a string value into an integer value when loading the feature divide model settings (.dsm). ", e);
                    }
                    if (code == -1) {
                        masterModel = new AtomicModel(-1, masterFeatureVector, this);
                        masterModel.setFrequency(freq);
                    } else if (divideModels != null) {
                        final AtomicModel divideModel = new AtomicModel(code, divideFeatureVector, this);
                        divideModel.setFrequency(freq);
                        divideModels.put(code, divideModel);
                    }
                    setFrequency(getFrequency() + freq);
                }
            } finally {
                // Always release the reader, also when a line is rejected.
                in.close();
            }
        } catch (IOException e) {
            throw new GuideException("Could not read from the guide model settings file '" + fileName + "', when "
                    + "loading the guide model settings. ", e);
        }
    }

    /**
     * Returns the parent model
     *
     * @return the parent model
     */
    public Model getParent() {
        return parent;
    }

    /**
     * Returns the guide of the parent model
     *
     * @return the guide of the parent model
     */
    public ClassifierGuide getGuide() {
        return parent.getGuide();
    }

    /**
     * Sets the parent model
     *
     * @param parent the parent model
     */
    protected void setParent(Model parent) throws MaltChainedException {
        this.parent = parent;
    }

    /**
     * Returns the model name of the parent model
     *
     * @return the model name of the parent model
     * @throws MaltChainedException if the parent guide model cannot be found.
     */
    public String getModelName() throws MaltChainedException {
        try {
            return parent.getModelName();
        } catch (NullPointerException e) {
            throw new GuideException("The parent guide model cannot be found. ", e);
        }
    }

    /**
     * Returns the "sum-up" or master feature vector
     *
     * @return a feature vector object
     */
    public FeatureVector getMasterFeatureVector() {
        return masterFeatureVector;
    }

    /**
     * Returns the divide feature vector
     *
     * @return a feature vector object
     */
    public FeatureVector getDivideFeatureVector() {
        return divideFeatureVector;
    }

    /**
     * Returns the frequency (number of instances)
     *
     * @return the frequency (number of instances)
     */
    public int getFrequency() {
        return frequency;
    }

    /**
     * Increase the frequency by 1, after doing the same for the parent when the
     * parent is an instance model
     */
    public void increaseFrequency() {
        if (parent instanceof InstanceModel) {
            ((InstanceModel) parent).increaseFrequency();
        }
        frequency++;
    }

    /**
     * Decrease the frequency by 1, after doing the same for the parent when the
     * parent is an instance model
     */
    public void decreaseFrequency() {
        if (parent instanceof InstanceModel) {
            ((InstanceModel) parent).decreaseFrequency();
        }
        frequency--;
    }

    /**
     * Sets the frequency (number of instances)
     *
     * @param frequency (number of instances)
     */
    protected void setFrequency(int frequency) {
        this.frequency = frequency;
    }

    /*
     * (non-Javadoc) @see java.lang.Object#toString()
     */
    @Override
    public String toString() {
        final StringBuilder sb = new StringBuilder();
        // TODO: no textual representation has been implemented yet; an empty string is returned.
        return sb.toString();
    }
}