/*******************************************************************************
 * Copyright 2012 University of Southern California
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * This code was developed by the Information Integration Group as part
 * of the Karma project at the Information Sciences Institute of the
 * University of Southern California. For more information, publications,
 * and related projects, please see: http://www.isi.edu/integration
 ******************************************************************************/
package edu.isi.karma.modeling.semantictypes.sl;

import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.isi.karma.modeling.semantictypes.myutils.Prnt;

/**
 * This class is used to split a field into tokens.
 *
 * @author amangoel
 */
public class Lexer {

    // Enumerated type used to remember what the lexer has been searching for while scanning the char stream.
    // static enum Searching {ALPHA, NUM, PUNC} ;

    // This is the main lexer function.
    // It takes a field as input and returns its tokens/components as a list.
    // It uses regexes to find the patterns for alpha words, numbers and symbols.
    // Assumption: a number can be 23, 23.45, .45, 0.45, 23.0 or 2,345,678.350, but it cannot be 23. (a decimal point with no digits after it).
    // All of the above can have a negative sign in front.
    // (A usage sketch follows after this class.)
    public static ArrayList<Part> tokenizeField(String field) {
        ArrayList<Part> part_list = new ArrayList<Part>();
        // Normalize non-breaking spaces (char 160) to regular spaces.
        String tmp_field = "";
        for (int i = 0; i < field.length(); i++) {
            char c = field.charAt(i);
            if ((int) c == 160)
                c = ' ';
            tmp_field += c;
        }
        field = tmp_field;
        field = field.trim();
        field = field.replaceAll("\\^", ""); // strip literal '^' characters (the caret must be escaped, otherwise the call is a no-op)
        if (field == null || field.equals("")) {
            Prnt.prn("Lexer got empty string or null string in tokenizeField");
            return part_list;
        }
        if (field.equals("NULLNULL")) {
            part_list.add(new Part("NULLNULL", Type.NULLNULL));
            return part_list;
        }
        Pattern pure_alpha = Pattern.compile("[a-zA-Z]+"); // no '|' inside the character class; it would match a literal pipe
        Pattern pure_symbol = Pattern.compile("\\W|\\_"); // underscore is treated as a symbol
        Pattern number = Pattern.compile("((\\-)?[0-9]{1,3}(,[0-9]{3})+(\\.[0-9]+)?)|((\\-)?[0-9]*\\.[0-9]+)|((\\-)?[0-9]+)");
        Matcher matcher;
        int start_index = 0, end_index = 0;
        while (true) {
            // Util.prn(field);
            matcher = pure_alpha.matcher(field); // pure alpha
            if (matcher.find() && (start_index = matcher.start()) == 0) {
                end_index = matcher.end();
                part_list.add(new Part(field.substring(start_index, end_index), Type.pure_alpha));
                field = field.substring(end_index).trim();
                if (field.equals(""))
                    break;
                else
                    continue;
            }
            matcher = number.matcher(field); // pure number
            if (matcher.find() && (start_index = matcher.start()) == 0) {
                end_index = matcher.end();
                part_list.add(new Part(field.substring(start_index, end_index), Type.number));
                field = field.substring(end_index).trim();
                if (field.equals(""))
                    break;
                else
                    continue;
            }
            matcher = pure_symbol.matcher(field); // symbol
            if (matcher.find() && (start_index = matcher.start()) == 0) {
                end_index = matcher.end();
                part_list.add(new Part(field.substring(start_index, end_index), Type.symbol));
                field = field.substring(end_index).trim();
                if (field.equals(""))
                    break;
                else
                    continue;
            }
            Prnt.endIt("Can't tokenize: field part does not match alpha, numeric or symbol");
        }
        return part_list;
    }

    /*
    // An earlier version of the lexer.
    // It takes a field as input and returns its tokens/components as a list.
    // It scans through the string separating out pure alphas, nums and other characters (consecutive symbols are treated separately).
    // This function separates out pure alphabets, pure nums and pure symbols.
    static ArrayList<String> tokenizeElement(String element) {
        ArrayList<String> tokenslist = new ArrayList<String>();
        String str = element.trim();
        if (str.equals("") || str == null) {
            Util.prn("Lexer got empty string or null string in tokenizeElement");
            System.exit(1);
        }
        if (str.equals("NULLNULL")) {
            tokenslist.add("NULLNULL");
            return tokenslist;
        }
        Searching state = Searching.PUNC;
        String token = "";
        for (int j = 0; j < str.length(); j++) {
            char c = str.charAt(j);
            if (state == Searching.ALPHA) {
                if (isAlpha(c))
                    token += c;
                else {
                    tokenslist.add(token);
                    if (isNum(c)) {
                        token = String.valueOf(c);
                        state = Searching.NUM;
                    } else {
                        if (!isSpace(c)) {
                            tokenslist.add(String.valueOf(c));
                        }
                        state = Searching.PUNC;
                    }
                }
            } else if (state == Searching.NUM) {
                if (isNum(c))
                    token += c;
                else {
                    tokenslist.add(token);
                    if (isAlpha(c)) {
                        token = String.valueOf(c);
                        state = Searching.ALPHA;
                    } else {
                        if (!isSpace(c)) {
                            tokenslist.add(String.valueOf(c));
                        }
                        state = Searching.PUNC;
                    }
                }
            } else {
                if (isAlpha(c)) {
                    token = String.valueOf(c);
                    state = Searching.ALPHA;
                } else if (isNum(c)) {
                    token = String.valueOf(c);
                    state = Searching.NUM;
                } else {
                    if (!isSpace(c)) {
                        tokenslist.add(String.valueOf(c));
                    }
                }
            }
        }
        if (state == Searching.ALPHA || state == Searching.NUM) {
            tokenslist.add(token);
        }
        return tokenslist;
    }
    */

    // This function tells if the given char is a letter (uppercase or lowercase).
    static boolean isAlpha(char c) {
        if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
            return true;
        else
            return false;
    }

    // This function tells if the given char is a digit (0-9).
    static boolean isNum(char c) {
        if (c >= '0' && c <= '9')
            return true;
        else
            return false;
    }

    // This function tells if the given char is a space.
    static boolean isSpace(char c) {
        int i = (int) c;
        if (i == 32 || i == 160) // 160 is also a space: the non-breaking (hard) space used on web pages to force a space.
            return true;
        else
            return false;
    }

    /*
     * old tokenize
     *
    static void tokenize(ArrayList<String> listofelements, Table table, ArrayList<String> tokenslist, ArrayList<String> listoftypelabels) throws Exception {
        tokenslist.clear();
        listoftypelabels.clear();
        ArrayList<String> typelabelsoftable = null;
        if (table != null && table.typelabels != null) {
            typelabelsoftable = table.typelabels;
            if (typelabelsoftable.size() != listofelements.size()) {
                Util.prn("The size of typelabelsoftable doesn't match the size of listofelements in Lexer.tokenize");
                System.exit(1);
            }
        }
        for (int i = 0; i < listofelements.size(); i++) {
        }
    }
    */

}
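
/*
 * Usage sketch (illustrative; not part of the original Lexer class). The class
 * name LexerUsageSketch is hypothetical. The sketch assumes only the API shown
 * above: Lexer.tokenizeField(String) returning an ArrayList<Part>, with Part
 * and Type defined in this package. The expected tokenizations in the comments
 * are derived from the regex rules documented on tokenizeField.
 */
class LexerUsageSketch {

    public static void main(String[] args) {
        // "3,410.25 USD" is expected to split into ("3,410.25", number) and ("USD", pure_alpha);
        // the comma-grouped decimal matches the first alternative of the number pattern.
        ArrayList<Part> priceParts = Lexer.tokenizeField("3,410.25 USD");
        System.out.println("\"3,410.25 USD\" -> " + priceParts.size() + " parts");

        // The next field is written with a non-breaking space (char 160) before '#', which
        // tokenizeField normalizes to a regular space; expected parts:
        // ("Room", pure_alpha), ("#", symbol), ("23", number).
        ArrayList<Part> roomParts = Lexer.tokenizeField("Room\u00A0#23");
        System.out.println("\"Room #23\" -> " + roomParts.size() + " parts");

        // "ISI-2012 v1.5" is expected to split into ("ISI", pure_alpha), ("-2012", number),
        // ("v", pure_alpha) and ("1.5", number); the '-' attaches to the number because the
        // number pattern allows a leading negative sign.
        ArrayList<Part> versionParts = Lexer.tokenizeField("ISI-2012 v1.5");
        System.out.println("\"ISI-2012 v1.5\" -> " + versionParts.size() + " parts");
    }

}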