package rainbownlp.util;

import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.Properties;

import weka.core.Stopwords;

import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

public class StringUtil {

    static Properties props = new Properties();
    static StanfordCoreNLP pipeline = null;
    static {
        // The lemma annotator requires tokenize, ssplit and pos;
        // requesting "lemma" alone fails when the pipeline runs.
        props.put("annotators", "tokenize, ssplit, pos, lemma");
        pipeline = new StanfordCoreNLP(props, false);
    }

    /**
     * @param inputString
     * @return MD5 hash of the given string as a 32-character hex digest
     * @throws UnsupportedEncodingException
     * @throws NoSuchAlgorithmException
     */
    public static String getStringDigest(String inputString)
            throws UnsupportedEncodingException, NoSuchAlgorithmException {
        MessageDigest md = MessageDigest.getInstance("MD5");
        // Hash the full byte array; using inputString.length() as the byte
        // count truncates multi-byte (non-ASCII) input.
        byte[] bytes = inputString.getBytes("UTF-8");
        md.update(bytes, 0, bytes.length);
        // %032x keeps the leading zeros that BigInteger.toString(16) drops.
        return String.format("%032x", new BigInteger(1, md.digest()));
    }

    /**
     * Customized definition of stop words: too short, on the Weka stop list,
     * or made up entirely of non-word characters.
     * @param word
     * @return true if the word should be treated as a stop word
     */
    public static boolean isStopWord(String word) {
        return word.length() < 2
                || Stopwords.isStopword(word)
                || word.matches("\\W+");
    }

    /**
     * Porter stem
     * @param word
     * @return stemmed, lower-cased word
     */
    public static String getWordPorterStem(String word) {
        PorterStemmer stemmer = new PorterStemmer();
        return stemmer.stem(word).toLowerCase();
    }

    /**
     * Escapes a string for safe inclusion in a SQL string or LIKE clause.
     */
    public static String prepareSQLString(String sqlString) {
        return sqlString.replace("\\", "\\\\")
                .replace("'", "''")
                .replace("%", "\\%")
                .replace("_", "\\_");
    }

    /**
     * Escapes regex metacharacters so the text can be used as a literal
     * pattern. (java.util.regex.Pattern.quote is a built-in alternative.)
     */
    public static String castForRegex(String textContent) {
        return textContent.replace("\\", "\\\\")
                .replace("/", "\\/")
                .replace("*", "\\*")
                .replace("+", "\\+")
                .replace(".", "\\.")
                .replace("?", "\\?")
                .replace(")", "\\)")
                .replace("{", "\\{")
                .replace("}", "\\}")
                .replace("(", "\\(")
                .replace("[", "\\[")
                .replace("]", "\\]")
                .replace("%", "\\%");
    }

    /**
     * Reverses the escaping applied by {@link #castForRegex(String)}.
     */
    public static String decastRegex(String textContent) {
        return textContent.replace("\\\\", "\\")
                .replace("\\/", "/")
                .replace("\\*", "*")
                .replace("\\+", "+")
                .replace("\\.", ".")
                .replace("\\?", "?")
                .replace("\\)", ")")
                .replace("\\_", "_")
                .replace("\\{", "{")
                .replace("\\}", "}")
                .replace("\\(", "(")
                .replace("\\[", "[")
                .replace("\\]", "]")
                .replace("\\%", "%");
    }
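    /*
     * Usage sketch for the escaping helpers above (illustrative, not part of
     * the original API): for the characters they handle, castForRegex and
     * decastRegex are intended to be inverses, so escaping and then
     * unescaping should round-trip.
     *
     *   String escaped = StringUtil.castForRegex("price (USD): 3.50");
     *   // escaped  -> "price \(USD\): 3\.50"
     *   String restored = StringUtil.decastRegex(escaped);
     *   // restored -> "price (USD): 3.50"
     */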
replace("\\[", "[").replace("\\]", "]").replace("\\%", "%"); } public static String getTermByTermPorter(String phrase) { String[] words = phrase.split(" "); String rootString = ""; for(int i=0;i<words.length;i++){ rootString += StringUtil.getWordPorterStem(words[i])+" "; } return rootString.trim(); } public static String compress(String text) { return text.replace(" ", "").replace(" ", ""); } static HashMap<String, String> lemmaCache = new HashMap<String, String>(); public static String getTermByTermWordnet(String phrase) { // if(lemmatiser ==null) // lemmatiser = new EngLemmatiser("/home/ehsan/rnlp/nlpdata/lemmatiser", // true, false); // String[] words = phrase.split(" "); // String rootString = ""; // for(int i=0;i<words.length;i++) // { // String lemma = lemmaCache.get(words[i]); // if(lemma == null) // { // lemma = lemmatiser.stem(words[i]); // lemmaCache.put(words[i], lemma); // } // rootString = rootString.concat(lemma+" "); // } StringBuilder rootStr = new StringBuilder(); Annotation document = pipeline.process(phrase); for(CoreMap sentence: document.get(SentencesAnnotation.class)) { for(CoreLabel token: sentence.get(TokensAnnotation.class)) { String word = token.get(TextAnnotation.class); String lemma = token.get(LemmaAnnotation.class); System.out.println(word+" --> lemmatized version :" + lemma); rootStr.append(lemma+" "); } } return rootStr.toString().trim(); } public static String getWordLemma(String word) { // if(lemmatiser ==null) // lemmatiser = new EngLemmatiser("/home/ehsan/rnlp/nlpdata/lemmatiser", // true, false); String word_lemma= ""; // // word_lemma= lemmatiser.stem(word); // return word_lemma; Annotation document = pipeline.process(word); for(CoreMap sentence: document.get(SentencesAnnotation.class)) { for(CoreLabel token: sentence.get(TokensAnnotation.class)) { String lemma = token.get(LemmaAnnotation.class); System.out.println(word+" --> lemmatized version :" + lemma); word_lemma += lemma+" "; } } return word_lemma; } public static boolean isDate(String token) { // not clear about the logic need testing // return !StringToTime.date(token).getClass().getName().endsWith("Boolean"); return false; } public static boolean isEmpty(String stringToCheck) { if(stringToCheck == null || stringToCheck.equals("")) return true; return false; } public static String concatArray(String string, String[] words) { StringBuilder concat = new StringBuilder(); for(int i=0;i<words.length;i++){ concat.append(words[i]+" "); } return concat.toString().trim(); } }