/** * Copyright 2014 Marco Cornolti * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package it.acubelab.smaph.entityfilters; import it.acubelab.smaph.SmaphAnnotatorDebugger; import it.acubelab.smaph.learn.LibSvmFilter; import java.io.IOException; import java.util.HashMap; import java.util.Vector; import org.apache.commons.lang.ArrayUtils; /** * An SVM-based entity filter. */ public class LibSvmEntityFilter extends LibSvmFilter implements EntityFilter { public static String[] ftrNames = new String[] { "is_s1", // 1 "is_s2", "is_s3", "is_s4", "is_s5", "s1_freq", "s1_rhoScore", // "s1_localCoherence", // "s1_lp", "s1_editDistance", // 10 "s1_commonness", // "s1_avgRank", "s1_ambiguity", "s1_pageRank", // "s2_editDistanceTitle", "s2_rank", "s2_wikiWebTotal", "s2_webTotal", "s3_rank", "s3_wikiWebTotal", // 20 "s3_editDistanceTitle", "s3_editDistanceNoPar", "s3_editDistanceBolds", "s3_capitalizedBolds", "s3_avgBoldsWords", "s5_rank", "s5_wikiWebTotal", "s5_editDistanceTitle", "s5_editDistanceNoPar", "s5_editDistanceBolds", // 30 "s5_capitalizedBolds", "s5_avgBoldsWords", "s3_webTotal", "s2_editDistanceNoPar", "s2_editDistanceBolds", "s2_capitalizedBolds", "s2_avgBoldsWords", // 37 }; public LibSvmEntityFilter(String modelFileBase) throws IOException { super(modelFileBase + ".model", modelFileBase + ".range"); } @Override public boolean filterEntity(HashMap<String, Double> features) { boolean result = predict(features); String ftrDesc = ""; for (String key : features.keySet()) ftrDesc += String.format("%s:%.3f ", key, features.get(key)); SmaphAnnotatorDebugger.out.printf("EF: %s has been %s.%n", ftrDesc, result ? "accepted" : "discarded"); return result; } private static double getOrDefault(HashMap<String, Double> features, String key, double defaultVal) { Double res = features.get(key); if (res == null) return defaultVal; return res; } /** * Turns a frature_name-feature_value mapping to an array of features. * * @param features * the mapping from feature names to feature values. * @return an array of feature values. */ public static double[] featuresToFtrVectStatic( HashMap<String, Double> features) { if (!checkFeatures(features)) { for (String ftrName : features.keySet()) System.err.printf("%s -> %f%n", ftrName, features.get(ftrName)); throw new RuntimeException( "Implementation error -- check the features"); } Vector<Double> ftrValues = new Vector<>(); for (String ftrName : ftrNames) ftrValues.add(getOrDefault(features, ftrName, 0.0)); return ArrayUtils.toPrimitive(ftrValues.toArray(new Double[] {})); } private static boolean checkFeatures(HashMap<String, Double> features) { if (getOrDefault(features, "is_s1", 0.0) + getOrDefault(features, "is_s2", 0.0) + getOrDefault(features, "is_s3", 0.0) + getOrDefault(features, "is_s4", 0.0) + getOrDefault(features, "is_s5", 0.0) != 1) return false; boolean found = false; for (String sourcePrefix : new String[] { "s1_", "s2_", "s3_", "s5_" }) { int sourceFtrCount = 0; for (String ftrName : features.keySet()) if (ftrName.startsWith(sourcePrefix)) sourceFtrCount++; if (sourcePrefix.equals("s1_")) found = sourceFtrCount == 9 && features.size() == sourceFtrCount + 1; if (sourcePrefix.equals("s2_")) found = sourceFtrCount == 8 && features.size() == sourceFtrCount + 1; if (sourcePrefix.equals("s3_")) found = sourceFtrCount == 8 && features.size() == sourceFtrCount + 1; if (sourcePrefix.equals("s4_")) found = sourceFtrCount == 0 && features.size() == sourceFtrCount + 1; if (sourcePrefix.equals("s5_")) found = sourceFtrCount == 8 && features.size() == sourceFtrCount + 1; if (found) return true; } return false; } @Override public double[] featuresToFtrVect(HashMap<String, Double> features) { return featuresToFtrVectStatic(features); } }