package edu.uncc.cs.watsonsim;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import edu.stanford.nlp.util.CacheMap;
/**
 * String utilities: canonicalization, sanitization, tokenization,
 * and vector-similarity helpers.
 * @author Jagan Vujjini
 */
public class StringUtils extends org.apache.commons.lang3.StringUtils {
	private static final Analyzer analyzer = new StandardAnalyzer();
	//private static Database db = new Database(); // Used for semantic distribution
	public static final int CONTEXT_LENGTH = 1000;
	//private static final int CONTEXT_HASH_COUNT = 20;
	private static final int CACHE_SIZE = 256;
	private static final CacheMap<String, ArrayList<Double>> context_cache_map = new CacheMap<>(CACHE_SIZE);
	/**
	 * Try to canonicalize a string somewhat conservatively. Basically, we:
	 *  - ignore case
	 *  - ignore punctuation such as (){}\/[]—<>;:,."'“”‘’«»「」…-
	 *    (The ' is probably debatable. The rest are generally inaudible, so
	 *    they wouldn't have counted in a question had it been spoken anyway.)
	 *  - ignore stopwords and some stems (the effect of Lucene's filters)
	 */
public static String canonicalize(String dirty) {
dirty = dirty
.toLowerCase()
.replaceAll("[(){}\\\\/\\[\\]—<>;:,.\"'“”‘’«»「」…-]", "")
.trim();
StringBuilder clean = new StringBuilder();
for (String token : tokenize(dirty)) {
clean.append(token);
clean.append(' ');
}
return clean.toString().trim();
}
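
	// A minimal sketch of the expected behavior (illustrative; assumes the
	// StandardAnalyzer's default English stopword set, which drops "the"):
	//   canonicalize("The Quick, Brown Fox")  ->  "quick brown fox"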
	/**
	 * Replace every character except ASCII alphanumerics and spaces with a space.
	 *
	 * This is a pretty aggressive thing to do, since it clears out tons of
	 * usually useful stuff: accented characters, punctuation, and so on.
	 */
public static String sanitize(String input) {
return input.replaceAll("[^A-Za-z0-9 ]", " ");
}
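
	// Illustrative example: every disallowed character becomes a space,
	// so runs of spaces can appear:
	//   sanitize("Café, 24/7")  ->  "Caf   24 7"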
	/** Splits the given string into tokens using Lucene's StandardAnalyzer,
	 *  which lowercases and drops English stopwords. */
public static List<String> tokenize(String text) {
List<String> tokens = new ArrayList<>();
try (TokenStream tokenStream = analyzer.tokenStream("text", text)) {
			CharTermAttribute token = tokenStream.addAttribute(CharTermAttribute.class);
			tokenStream.reset();
			while (tokenStream.incrementToken()) {
				tokens.add(token.toString());
			}
			// Lucene's TokenStream contract is reset(), incrementToken()*, end(),
			// close(); try-with-resources takes care of close().
			tokenStream.end();
		} catch (IOException e) {
			// On the fence whether it is better to error here or not. Suggestions?
			// For now, if we can't tokenize it, return whatever we have so far.
			e.printStackTrace();
}
return tokens;
}
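
	// Expected behavior, sketched ("a" and "of" are in the default English
	// stopword set, and the tokenizer discards the comma):
	//   tokenize("A list of tokens, please")  ->  ["list", "tokens", "please"]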
	/** Tokenize a string conservatively: lowercase it and split on whitespace
	 *  and ASCII punctuation. No stopword removal or stemming is applied. */
	public static List<String> conservativeTokenize(String text) {
		// The '-' goes last in the character class so it is a literal hyphen;
		// between '+' and '=' it would form a range that swallows the digits.
		String[] token_arr = text.toLowerCase().split(
				"[ \t~`@#$%^&\\*\\(\\)_\\+=\\{\\}\\[\\]:\";'<>\\?,./\\|\\\\-]+");
		return Arrays.asList(token_arr);
	}
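
	// Sketch of expected output (',', ' ', and '_' are all delimiters here):
	//   conservativeTokenize("Hello, World_2")  ->  ["hello", "world", "2"]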
	/** Returns true if every non-stopword from candidate is found in reference. */
	public static boolean matchSubset(String candidate, String reference) {
		// Match these two sets in linear (or linearithmic) time.
		// The set holds the reference's tokens, so the containment check runs in
		// the documented direction: candidate ⊆ reference.
		HashSet<String> reference_terms = new HashSet<>(StringUtils.tokenize(reference));
		return reference_terms.containsAll(StringUtils.tokenize(candidate));
	}
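
	// Illustrative example: the candidate's non-stopword tokens {brown, fox}
	// all appear among the reference's tokens, so this holds:
	//   matchSubset("the brown fox", "a quick brown fox jumps")  ->  true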
	/**
	 * Fetch and merge the phrase contexts from a database.
	 * The "safe" part is that it may return a wrong (possibly partial) answer,
	 * but it will not throw an exception.
	 * @param phrase the phrase whose per-word context vectors are merged
	 * @return the merged phrase vector, unless an error occurred.
	 */
/*public static ArrayList<Double> getPhraseContextSafe(String phrase) {
ArrayList<Double> merged_context = context_cache_map.get( phrase);
if (merged_context == null) {
merged_context = new ArrayList<>();
for (int i=0; i<CONTEXT_LENGTH; i++) merged_context.add(0.0);
// Filter repeated words
// word_set = S.toList $ S.fromList $ words phrase
PreparedStatement context_retriever = db.prep("SELECT context, count FROM rindex WHERE word == ?;");
HashSet<String> word_set = new HashSet<String>();
word_set.addAll(StringUtils.conservativeTokenize(phrase));
// Sum the context vectors
// foldl' (V.zipWith (+)) (V.replicate 1000) context_vectors
try {
for (String word : word_set) {
context_retriever.setString(1, word);
ResultSet sql_context = context_retriever.executeQuery();
if (sql_context.next()) {
java.nio.DoubleBuffer buffer = java.nio.ByteBuffer.wrap(sql_context.getBytes(1)).asDoubleBuffer();
double total = 0;
// Normalize each word so that they have the same weight when combined
for (int i=0; i<CONTEXT_LENGTH; i++)
total += buffer.get(i);
for (int i=0; i<CONTEXT_LENGTH; i++)
merged_context.set(i, merged_context.get(i) + (buffer.get(i) / total));
}
}
} catch (SQLException e) {} // At worst, return what we have so far. Maybe nothing.
}
context_cache_map.put(phrase, merged_context);
return merged_context;
}*/
	/**
	 * Find the cosine similarity between two vectors:
	 * 1 is identical, 0 is orthogonal.
	 * Synonyms are usually between 0.6 and 0.8.
	 * Only the overlapping prefix of the two vectors is compared.
	 * @param vec1 the first vector
	 * @param vec2 the second vector
	 * @return a double between 0 and 1 (for non-negative vectors)
	 */
public static double getCosineSimilarity(ArrayList<Double> vec1, ArrayList<Double> vec2) {
double xy = 0;
double xsquared = 0;
double ysquared = 0;
int length = Math.min(vec1.size(), vec2.size());
for (int i=0; i<length; i++) {
double x = vec1.get(i);
double y = vec2.get(i);
// Ignore uncertain dimensions
// This little kludge makes a big difference
if (Math.max(Math.abs(x), Math.abs(y)) > 0.1) {
xy += x * y;
xsquared += x * x;
ysquared += y * y;
}
}
		// Double.MIN_NORMAL keeps this from dividing 0 by 0 (yielding NaN) when
		// both vectors are empty or every dimension was skipped as uncertain.
		return xy / (Math.sqrt(xsquared) * Math.sqrt(ysquared) + Double.MIN_NORMAL);
}
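
	// Illustrative values (not from the source):
	//   getCosineSimilarity(new ArrayList<>(Arrays.asList(1.0, 0.0)),
	//                       new ArrayList<>(Arrays.asList(1.0, 0.0)))  ->  ~1.0 (identical)
	//   getCosineSimilarity(new ArrayList<>(Arrays.asList(1.0, 0.0)),
	//                       new ArrayList<>(Arrays.asList(0.0, 1.0)))  ->  0.0  (orthogonal)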
}