package context.core.task.bigram;

import java.io.File;
import java.util.ArrayList;
import java.util.List;

import context.core.entity.CorpusData;
import context.core.entity.FileData;
import context.core.util.CodebookUtils;
import context.core.util.DefaultValueHashMap;
import context.core.util.ForAggregation;
import context.core.util.JavaIO;

/**
 * Counts adjacent word pairs (bigrams) over every file of a corpus and writes
 * them, together with a mutual-information score, to a CSV file.
 *
 * <p>Typical use: construct, call {@link #getMutualInfo()} once to collect the
 * counts, then call {@link #writeOutput(String)}.
 *
 * @author Aale
 */
public class BigramBody {

    private BigramApplicationTaskInstance instance;
    private CorpusData input;

    /**
     * word -&gt; {@code int[2]}: index 0 counts how often the word was the first
     * element of a bigram, index 1 how often it was the second element.
     */
    private DefaultValueHashMap baseWords;

    /** {@link ForAggregation} of {first, second} -&gt; number of occurrences. */
    private DefaultValueHashMap bigrams;

    /** Total number of bigram occurrences counted so far (not distinct bigrams). */
    private int totalBigrams;

    /**
     * Creates a bigram counter bound to the given task instance and resets all
     * counters.
     *
     * @param instance task instance whose input is read as a {@link CorpusData}
     */
    public BigramBody(BigramApplicationTaskInstance instance) {
        this.instance = instance;
        init();
    }

    /** Reads the corpus from the task instance and creates empty count tables. */
    private void init() {
        this.input = (CorpusData) instance.getInput();
        int[] defaultVal = {0, 0};
        totalBigrams = 0;
        baseWords = new DefaultValueHashMap(defaultVal);
        bigrams = new DefaultValueHashMap(0);
    }

    /**
     * Reads every file of the corpus and accumulates bigram and per-word
     * position counts into this object's tables.
     *
     * <p>Counts are only reset in the constructor, so calling this method more
     * than once adds to the existing counts.
     *
     * @return {@code true} if all files were processed; {@code false} if any
     *         exception was thrown (its stack trace is printed and counts
     *         gathered up to that point are kept)
     */
    public boolean getMutualInfo() {
        List<FileData> files = input.getFiles();
        try {
            for (FileData ff : files) {
                File file = ff.getFile();
                String fullText = JavaIO.readFile(file);

                // This code treats a null entry in the token stream as the end
                // of a sentence; bigrams never span two sentences.
                List<String> currentSentence = new ArrayList<String>();
                for (String token : CodebookUtils.make_sentences(fullText)) {
                    if (token == null) {
                        countSentenceBigrams(currentSentence);
                        currentSentence.clear();
                        continue;
                    }
                    currentSentence.add(token);
                }
                // Tokens after the last null would otherwise be dropped. This is
                // a no-op when the stream always ends with a null terminator.
                // NOTE(review): confirm against CodebookUtils.make_sentences.
                if (!currentSentence.isEmpty()) {
                    countSentenceBigrams(currentSentence);
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
        return true;
    }

    /**
     * Joins the tokens of one sentence, strips unwanted characters and counts
     * every pair of adjacent words.
     *
     * @param tokens tokens of a single sentence, in order
     */
    private void countSentenceBigrams(List<String> tokens) {
        StringBuilder joined = new StringBuilder();
        for (String token : tokens) {
            joined.append(token).append(" ");
        }

        // Keep only letters, digits, underscore and spaces, then remove the
        // literal substrings "LRB" and "RRB" wherever they occur.
        String sentence = joined.toString();
        sentence = sentence.replaceAll("[^A-Za-z0-9\\w ]", "");
        sentence = sentence.replaceAll("LRB", "");
        sentence = sentence.replaceAll("RRB", "");

        // split() yields a leading "" when the sentence starts with whitespace;
        // the empty-string check below skips it as well as seeding the first word.
        String previous = "";
        for (String word : sentence.split("\\s+")) {
            if (previous.equals("")) {
                previous = word;
                continue;
            }
            recordBigram(previous, word);
            previous = word;
        }
    }

    /**
     * Records one occurrence of the bigram (first, second).
     *
     * @param first  first word of the pair
     * @param second second word of the pair
     */
    private void recordBigram(String first, String second) {
        totalBigrams++;

        String[] keyStrings = {first, second};
        ForAggregation key = new ForAggregation(keyStrings);
        bigrams.put(key, ((int) bigrams.get(key)) + 1);

        // The value returned by get() is cloned before mutation so that the
        // default array handed to the map's constructor is never modified.
        int[] firstCounts = ((int[]) baseWords.get(first)).clone();
        firstCounts[0]++;
        baseWords.put(first, firstCounts);

        // Looked up after the update above, so first.equals(second) is handled.
        int[] secondCounts = ((int[]) baseWords.get(second)).clone();
        secondCounts[1]++;
        baseWords.put(second, secondCounts);
    }

    /**
     * Writes the collected bigram statistics as CSV to the given path.
     *
     * @param filepath path of the CSV file to write; an existing file at this
     *                 path is deleted first
     */
    public void writeOutput(String filepath) {
        this.writeCsv(baseWords, bigrams, totalBigrams, filepath);
    }

    /**
     * Builds the CSV text (header plus one row per distinct bigram) and writes
     * it to {@code filepath}.
     *
     * <p>For a bigram (a, b) with P(a) = count(a as first word) / total,
     * P(b) = count(b as second word) / total and P(a,b) = count(a,b) / total,
     * the score written is {@code P(a,b) * log2(P(a,b) / (P(a) * P(b)))}.
     *
     * @param baseWords    word -&gt; {@code int[2]} position counts
     * @param bigrams      bigram key -&gt; occurrence count
     * @param totalBigrams total number of bigram occurrences
     * @param filepath     destination file path
     */
    private void writeCsv(DefaultValueHashMap baseWords, DefaultValueHashMap bigrams, int totalBigrams, String filepath) {
        StringBuilder sb = new StringBuilder();
        // Word a and Word b are written as separate columns -- by Ming Jiang
        sb.append("Word a,Word b,Frequency,Mutual Information\n");

        double total = (double) totalBigrams;
        for (Object bigram : bigrams.keySet().toArray()) {
            String[] bigramStr = ((ForAggregation) bigram).toAggregate;
            int frequency = (int) bigrams.get(bigram);

            double probX = ((double) ((int[]) baseWords.get(bigramStr[0]))[0]) / total;
            double probY = ((double) ((int[]) baseWords.get(bigramStr[1]))[1]) / total;
            double probXY = ((double) frequency) / total;
            double mutualInfo = probXY * Math.log(probXY / (probX * probY)) / Math.log(2);

            sb.append(bigramStr[0]).append(",")
              .append(bigramStr[1]).append(",")
              .append(Integer.toString(frequency)).append(",")
              .append(Double.toString(mutualInfo)).append("\n");
        }

        // 2016.03: delete any existing file before writing the new one.
        File toDelete = new File(filepath);
        if (toDelete.exists() && !toDelete.delete()) {
            System.err.println("Could not delete existing output file: " + filepath);
        }
        FileData.writeDataIntoFile(sb.toString(), filepath);
    }
}