package org.sakaiproject.citation.util.impl; import java.util.HashSet; import java.util.Set; import java.util.StringTokenizer; public class InputStringParser implements org.sakaiproject.citation.util.api.InputStringParser { private static final java.util.Set<String> COMMON_WORDS = new java.util.HashSet<String>(); private static final String DOUBLE_QUOTE = "\""; // the parser switches between these two sets of delimiters private static final String WHITESPACE_AND_QUOTES = " \t\r\n\""; private static final String QUOTES_ONLY ="\""; // Common words against which searches will not be performed. static { COMMON_WORDS.add("a"); COMMON_WORDS.add("and"); COMMON_WORDS.add("be"); COMMON_WORDS.add("for"); COMMON_WORDS.add("from"); COMMON_WORDS.add("has"); COMMON_WORDS.add("i"); COMMON_WORDS.add("in"); COMMON_WORDS.add("is"); COMMON_WORDS.add("it"); COMMON_WORDS.add("of"); COMMON_WORDS.add("on"); COMMON_WORDS.add("to"); COMMON_WORDS.add("the"); COMMON_WORDS.add("not"); COMMON_WORDS.add("or"); } /** * Parse keywords into a Set of Strings. This method recognizes phrases * (marked using quotation marks) and drops common words (i.e. and, or, not, * to, the, etc.) if they are not part of a phrase. Each element of the * resulting Set will be a single term or a phrase. * * @param inputString the input a user has submitted (i.e. from an HTML * input field in a form) * @return a Set containing individual search terms or phrases or null if * inputString is null or empty. */ public Set<String> parseInputString( String inputString ) { if( inputString == null || inputString.trim().equals( "" ) ) { return null; } Set<String> result = new HashSet<String>(); boolean returnTokens = true; String currentDelim = WHITESPACE_AND_QUOTES; StringTokenizer parser = new StringTokenizer( inputString, currentDelim, returnTokens ); String token = null; while ( parser.hasMoreTokens() ) { token = parser.nextToken( currentDelim ); if ( !isDoubleQuote(token) ){ addNonTrivialWordToResult( token, result ); } else { currentDelim = switchDelimiters( currentDelim ); } } return result; } private static boolean isCommonWord( String searchTokenCandidate ) { return COMMON_WORDS.contains( searchTokenCandidate ); } private static boolean textHasContent( String text ) { return ( text != null ) && ( !text.trim().equals( "" ) ); } private static void addNonTrivialWordToResult( String token, Set<String> result ){ if ( textHasContent( token ) && !isCommonWord( token.trim() ) ) { result.add( token.trim() ); } } private static boolean isDoubleQuote( String token ){ return token.equals( DOUBLE_QUOTE ); } private static String switchDelimiters( String currentDelim ) { String result = null; if ( currentDelim.equals( WHITESPACE_AND_QUOTES ) ) { result = QUOTES_ONLY; } else { result = WHITESPACE_AND_QUOTES; } return result; } }