/*******************************************************************************
 * Copyright 2012 University of Southern California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This code was developed by the Information Integration Group as part
 * of the Karma project at the Information Sciences Institute of the
 * University of Southern California. For more information, publications,
 * and related projects, please see: http://www.isi.edu/integration
 ******************************************************************************/
package edu.isi.karma.modeling.semantictypes.sl;

import java.util.ArrayList;

import edu.isi.karma.modeling.semantictypes.myutils.Prnt;

/**
 * Generates string-valued features for whole fields and for individual tokens.
 * <p>
 * Every feature is built by concatenating a prefix constant from
 * {@code Feature} with a value derived from the text, for example the first
 * character of an alphabetic token, the number of digits before the decimal
 * point of a number, or the symbol itself ({@code Symbol<X>}).
 *
 * @author amangoel
 */
public class RegexFeatureExtractor {

    /**
     * Builds the field-level features for a raw field value.
     * <p>
     * The field is trimmed and then tokenized with
     * {@code Lexer.tokenizeField}. A field that yields exactly one token gets
     * the single-token feature; a field with several tokens gets one feature
     * for its first token and one for its last token.
     *
     * @param field the raw field text; may be {@code null}
     * @return the list of features; empty when {@code field} is {@code null},
     *         blank after trimming, or produces no tokens
     */
    public static ArrayList<String> getFieldFeatures(String field) {
        ArrayList<String> feature_list = new ArrayList<String>();
        if (field == null) {
            return feature_list;
        }

        // Strings are immutable: trim() returns a new value, so it has to be
        // captured. Previously the result was discarded and whitespace-only
        // fields slipped past the emptiness check below.
        String trimmed = field.trim();
        if (trimmed.equals("")) {
            return feature_list;
        }

        ArrayList<Part> parts = Lexer.tokenizeField(trimmed);
        // Defensive guard: the lexer's contract is not visible from this
        // class, so avoid indexing into a missing or empty token list.
        if (parts == null || parts.isEmpty()) {
            return feature_list;
        }

        if (parts.size() == 1) {
            feature_list.add(Feature.single_token_field);
        } else {
            feature_list.add(Feature.starts_with_token_ + parts.get(0).string);
            feature_list.add(Feature.ends_with_token_ + parts.get(parts.size() - 1).string);
        }
        return feature_list;
    }

    /**
     * Builds the token-level features for a single lexed part.
     * <p>
     * The features depend on the part's type:
     * <ul>
     * <li>alphabetic: length, first character, capitalization and the token
     * itself;</li>
     * <li>number: optional negative-sign marker, then lengths and selected
     * digits of the unsigned value;</li>
     * <li>symbol: the symbol itself.</li>
     * </ul>
     * Any other type is reported through {@code Prnt.endIt}.
     *
     * @param part the token to describe
     * @return the list of features for {@code part}
     */
    public static ArrayList<String> getTokenFeatures(Part part) {
        ArrayList<String> feature_list = new ArrayList<String>();
        String token = part.string;

        if (part.type == Type.pure_alpha) {
            addAlphaFeatures(token, feature_list);
        } else if (part.type == Type.number) {
            addNumberFeatures(token, feature_list);
        } else if (part.type == Type.symbol) {
            feature_list.add(Feature.symbol_ + token);
        } else {
            Prnt.endIt("RegexFeatureExtract.getTokenFeatures: type of part not found to be any of alpha, num, sym. \nEnding.");
        }
        return feature_list;
    }

    /** Appends the length, first-character, capitalization and identity features of an alphabetic token. */
    private static void addAlphaFeatures(String token, ArrayList<String> feature_list) {
        feature_list.add(Feature.alpha_length_ + token.length());

        String first_character = token.substring(0, 1);
        feature_list.add(Feature.starts_with_char_ + first_character);

        if (isAllUpperCaseAscii(token)) {
            feature_list.add(Feature.all_capitalized_token);
        } else if (isUpperCaseAscii(first_character.charAt(0))) {
            feature_list.add(Feature.capitalized_token);
        }

        feature_list.add(Feature.alpha_id_ + token);
    }

    /** Appends the sign, length and digit features of a numeric token. */
    private static void addNumberFeatures(String token, ArrayList<String> feature_list) {
        // Compare by content, not by reference: "==" on the result of
        // substring() never matched the "-" literal, so negative numbers
        // were not recognized and the sign leaked into the digit features.
        if (token.startsWith("-")) {
            token = token.substring(1);
            feature_list.add(Feature.neg_num);
        }

        String first_part;
        String decimal_part;
        int decimal_index = token.indexOf(".");
        if (decimal_index >= 0) {
            first_part = token.substring(0, decimal_index);
            decimal_part = token.substring(decimal_index + 1);
        } else {
            first_part = token;
            decimal_part = "";
        }

        // The total length counts the decimal point but not the sign.
        feature_list.add(Feature.num_len_ + token.length());
        feature_list.add(Feature.before_decimal_len_ + first_part.length());
        feature_list.add(Feature.after_decimal_len_ + decimal_part.length());

        if (!first_part.equals("")) {
            feature_list.add(Feature.starting_digit_ + first_part.substring(0, 1));
            feature_list.add(Feature.unit_place_digit_ + first_part.substring(first_part.length() - 1));
        }
        if (!decimal_part.equals("")) {
            feature_list.add(Feature.tenth_place_digit_ + decimal_part.substring(0, 1));
        }
    }

    /** Returns true when every character of the text is in the range 'A'..'Z'. */
    private static boolean isAllUpperCaseAscii(String text) {
        for (int i = 0; i < text.length(); i++) {
            if (!isUpperCaseAscii(text.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Returns true when the character is in the range 'A'..'Z'. */
    private static boolean isUpperCaseAscii(char c) {
        return c >= 'A' && c <= 'Z';
    }
}