/*******************************************************************************
 * Copyright 2012 University of Southern California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * 	http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This code was developed by the Information Integration Group as part
 * of the Karma project at the Information Sciences Institute of the
 * University of Southern California.  For more information, publications,
 * and related projects, please see: http://www.isi.edu/integration
 ******************************************************************************/
package edu.isi.karma.cleaning.features;

import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Vector;

// NOTE(review): `Static` is not referenced anywhere in this file; this looks
// like an accidental IDE auto-import. Confirm and consider removing it.
import com.sun.tools.xjc.reader.xmlschema.bindinfo.BIConversion.Static;

import edu.isi.karma.cleaning.Ruler;
import edu.isi.karma.cleaning.TNode;

/**
 * Builds the feature vector for a single record.
 *
 * <p>The feature vector is the concatenation of two groups, always in this
 * order:
 * <ol>
 * <li>one {@link RecordCntFeatures} per entry of {@link #xStrings};</li>
 * <li>one {@link RecordTextFeature} per entry of {@link #vocabs}.</li>
 * </ol>
 * {@link #getFeatureNames()} produces names in the same order, so position
 * {@code i} of the names corresponds to position {@code i} of the features.
 *
 * <p>{@link #initialize(Vector)} must be called to populate the vocabulary;
 * until then the vocabulary is empty and only the {@code xStrings} features
 * are produced.
 */
public class RecordFeatureSet {

    /** Not assigned anywhere in this class. */
    public Collection<Feature> features;

    /** Not assigned anywhere in this class. */
    public String record;

    /** Every distinct label that has been passed to {@link #computeFeatures}. */
    public HashSet<String> labels = new HashSet<String>();

    /**
     * Patterns handed to {@link RecordCntFeatures}, one feature per entry.
     * The entries are written with regular-expression escaping; how they are
     * matched is decided by {@code RecordCntFeatures}, which is not visible
     * here.
     */
    public String[] xStrings = { "#", ";", ",", "!", "~", "@", "$", "%", "^",
            "&", "\\*", "\\(", "\\)", "_", "-", "\\{", "\\}", "\\[", "\\]",
            "\\\"", "\\\'", ":", "\\?", "<", ">", "\\.", "/", "\\\\", "\\d+",
            "[A-Z]+", "[a-z]+", "[\\s]" };
    //public String[] xStrings = {"\\d+"};

    /**
     * Distinct token texts collected by {@link #initialize(Vector)}. Starts
     * out empty (rather than {@code null}) so that the feature methods are
     * safe to call before {@code initialize}.
     */
    public String[] vocabs = new String[0];

    /**
     * Returns the labels seen so far.
     *
     * @return a new array holding the current contents of {@link #labels};
     *         the order is the set's iteration order
     */
    public String[] getLabels() {
        return labels.toArray(new String[labels.size()]);
    }

    public RecordFeatureSet() {
    }

    /**
     * Converts the records to token sequences and builds the vocabulary from
     * the distinct token texts. Replaces any previously built vocabulary.
     *
     * @param Records the records to scan
     */
    public void initialize(Vector<String> Records) {
        HashSet<String> distinctTexts = new HashSet<String>();
        for (String rec : Records) {
            Ruler ruler = new Ruler();
            ruler.setNewInput(rec);
            for (TNode node : ruler.vec) {
                // Set.add already ignores duplicates, no contains() check needed.
                distinctTexts.add(node.text);
            }
        }
        vocabs = distinctTexts.toArray(new String[distinctTexts.size()]);
    }

    /**
     * Builds the features of one record and remembers its label.
     *
     * @param record the record to build features for
     * @param label  the record's label; added to {@link #labels}
     * @return a new collection with the {@code xStrings} features first,
     *         followed by the vocabulary features
     */
    public Collection<Feature> computeFeatures(String record, String label) {
        Vector<Feature> result = new Vector<Feature>();
        for (String pattern : xStrings) {
            // The pattern is passed as both the first and third argument,
            // exactly as before.
            result.add(new RecordCntFeatures(pattern, record, pattern));
        }
        for (String word : vocabs) {
            result.add(new RecordTextFeature(word, record));
        }
        labels.add(label);
        return result;
    }

    /**
     * Returns positional feature names {@code attr_0 .. attr_(n-1)}, where
     * {@code n} is the number of {@code xStrings} entries plus the number of
     * vocabulary entries.
     *
     * @return a new collection of feature names, in feature order
     */
    public Collection<String> getFeatureNames() {
        Vector<String> names = new Vector<String>();
        int total = xStrings.length + vocabs.length;
        for (int i = 0; i < total; i++) {
            names.add("attr_" + i);
        }
        return names;
    }

    public static void main(String[] args) {
    }
}