package context.core.util;

import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.MappedByteBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.Charset;
import java.util.List;
import java.util.Properties;
import java.util.Vector;

/**
 * Text utilities for the codebook tools: simple file I/O plus tokenization of
 * text into words, sentences, paragraphs, or custom-delimited units using
 * Stanford CoreNLP.
 *
 * @author Kiumars Soltani
 */
public class CodebookUtils {

    /** Message of the last I/O error, or "" if the last file operation succeeded. */
    public static String log;

    /**
     * Writes {@code content} to {@code file}, creating the file if it does not exist.
     *
     * @param file    destination file
     * @param content text to write
     * @return 1 on success, 0 on failure (the error message is stored in {@link #log})
     */
    public static int write_string(File file, String content) {
        log = "";
        try {
            // if the file doesn't exist, create it
            if (!file.exists()) {
                file.createNewFile();
            }
            // try-with-resources guarantees the writer is closed even if write() fails
            try (BufferedWriter bw = new BufferedWriter(new FileWriter(file.getAbsoluteFile()))) {
                bw.write(content);
            }
        } catch (IOException e) {
            log += e.getMessage();
            return 0;
        }
        return 1;
    }

    /**
     * Tokenizes {@code input} according to {@code type}:
     * 1 = sentences, 2 = paragraphs, 3 = words, 4 = custom delimiter.
     *
     * @param input text to tokenize
     * @param type  tokenization mode (see above)
     * @param delim delimiter used when {@code type} is 4; ignored otherwise
     * @return the resulting tokens, or null for an unknown type
     */
    public static Vector<String> getWords(String input, int type, String delim) {
        switch (type) {
            case 3:
                return CodebookUtils.make_words(input);
            case 1:
                return CodebookUtils.make_sentences(input);
            case 2:
                return CodebookUtils.make_paragraph(input);
            case 4:
                return CodebookUtils.make_custom_token(input, delim);
        }
        return null;
    }

    /**
     * Splits {@code content} into word tokens using the CoreNLP tokenizer.
     *
     * @param content text to tokenize
     * @return one entry per token
     */
    public static Vector<String> make_words(String content) {
        Vector<String> return_list = new Vector<String>();
        // Note: building a new pipeline on every call is expensive; callers that
        // tokenize many documents may want to reuse a single StanfordCoreNLP instance.
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // strip control characters and anything outside a basic ASCII set,
        // then collapse newline runs (2016.03) before tokenizing
        content = content.replaceAll("\\p{Cc}", " ");
        content = content.replaceAll("[^A-Za-z0-9 \\_ :;!\\?\\.,\'\"-]", " ");
        content = content.replaceAll("(\\r|\\n|\\r\\n)+", " ");
        System.out.println("contentInMakeWords: " + content);

        Annotation document = new Annotation(content);
        pipeline.annotate(document);
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                String word = token.get(TextAnnotation.class);
                if (word != null) {
                    return_list.add(word);
                }
            }
        }
        return return_list;
    }

    /**
     * Splits {@code content} into word tokens grouped by sentence: the tokens of
     * each sentence are followed by a null entry marking the sentence boundary.
     *
     * @param content text to tokenize
     * @return tokens with a null marker after each sentence
     */
    public static Vector<String> make_sentences(String content) {
        Vector<String> return_list = new Vector<String>();
        Properties props = new Properties();
        props.put("annotators", "tokenize, ssplit");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        content = content.replaceAll("\\p{Cc}", " ");
        content = content.replaceAll("[^A-Za-z0-9 \\_ :;!\\?\\.,\'\"-]", " ");
        // 2016.03 process newline characters
        content = content.replaceAll("(\\r|\\n|\\r\\n)+", " ");
        System.out.println("contentInMakeSentences: " + content);

        Annotation document = new Annotation(content);
        pipeline.annotate(document);
        List<CoreMap> sentences = document.get(SentencesAnnotation.class);
        for (CoreMap sentence : sentences) {
            for (CoreLabel token : sentence.get(TokensAnnotation.class)) {
                String word = token.get(TextAnnotation.class);
                if (word != null) {
                    return_list.add(word);
                }
            }
            // null marks the end of a sentence
            return_list.add(null);
        }
        return return_list;
    }

    /**
     * Splits {@code content} into word tokens grouped by paragraph: the tokens of
     * each paragraph are followed by a null entry marking the paragraph boundary.
     *
     * @param content text to tokenize
     * @return tokens with a null marker after each paragraph
     */
    public static Vector<String> make_paragraph(String content) {
        // 2016.03 collapse newline runs into a single newline so the split below
        // actually finds the paragraph boundaries
        content = content.replaceAll("(\\r|\\n|\\r\\n)+", "\n");
        System.out.println("contentInMakeParagraph: " + content);
        String[] ss = content.split("\n");
        Vector<String> return_list = new Vector<String>();
        for (String s : ss) {
            return_list.addAll(make_words(s));
            return_list.add(null);
        }
        return return_list;
    }

    /**
     * Splits {@code content} on {@code token} (interpreted as a regular expression
     * by {@link String#split}) and tokenizes each piece into words, appending a
     * null marker after each piece.
     *
     * @param content text to tokenize
     * @param token   delimiter regular expression
     * @return tokens with a null marker after each delimited piece
     */
    public static Vector<String> make_custom_token(String content, String token) {
        String[] ss = content.split(token);
        Vector<String> return_list = new Vector<String>();
        for (String s : ss) {
            return_list.addAll(make_words(s));
            return_list.add(null);
        }
        return return_list;
    }

    /**
     * Reads an entire file into a String using the platform default charset.
     *
     * @param input file to read
     * @return the file contents, or "" on failure (the error message is stored in {@link #log})
     */
    public static String read_file(File input) {
        log = "";
        String output = "";
        FileInputStream stream;
        try {
            stream = new FileInputStream(input);
            try {
                FileChannel fc = stream.getChannel();
                MappedByteBuffer bb = fc.map(FileChannel.MapMode.READ_ONLY, 0, fc.size());
                // Decodes with the platform default charset; pass an explicit
                // decoder here if a fixed encoding (e.g. UTF-8) is required.
                output = Charset.defaultCharset().decode(bb).toString();
            } finally {
                stream.close();
            }
        } catch (IOException e) {
            log += e.getMessage();
            return "";
        }
        return output;
    }
}
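
/*
 * Minimal usage sketch (not part of the original utility). The file names below
 * are hypothetical placeholders, and the Stanford CoreNLP jars and models are
 * assumed to be on the classpath. It reads a file, tokenizes it into words, and
 * writes the tokens back out, one per line.
 */
class CodebookUtilsDemo {

    public static void main(String[] args) {
        String text = CodebookUtils.read_file(new java.io.File("input.txt"));
        // type 3 = word tokens; the delimiter argument is ignored in this mode
        java.util.Vector<String> tokens = CodebookUtils.getWords(text, 3, null);
        CodebookUtils.write_string(new java.io.File("tokens.txt"), String.join("\n", tokens));
    }
}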