/* * To change this template, choose Tools | Templates * and open the template in the editor. */ package edu.columbia.stat.wood.bnol; import edu.columbia.stat.wood.bnol.util.MutableInt; import edu.columbia.stat.wood.bnol.util.Pair; import java.io.BufferedReader; import java.io.File; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.IOException; import java.util.Arrays; import java.util.HashMap; /** * * @author nicholasbartlett */ public class ProcessCHILDES { private File data; private HashMap<String, Integer> encoder = new HashMap(); private HashMap<Integer,String> dictionary = new HashMap(); private int size; public ProcessCHILDES(File data) throws IOException{ this.data = data; initialize(); } public int size(){ return size; } public HashMap<Integer, String> dictionary(){ return dictionary; } public int[] get(int length) throws IOException{ if(length > size){ length = size; } int[] d = new int[length]; CHILDESIterator iter = new CHILDESIterator(); for(int i = 0; i < length; i++){ d[i] = iter.next(); } iter.close(); return d; } private void initialize() throws IOException{ BufferedReader br = null; size = 0; try { br = new BufferedReader(new FileReader(data)); String line; while ((line = br.readLine()) != null){ String[] words = line.trim().split("[\\s]+"); for(String word : words){ size++; word = word.toLowerCase(); if(encoder.get(word) == null){ dictionary.put(encoder.size(), word); encoder.put(word, encoder.size()); } } } } finally { if (br != null){ br.close(); } } } private class CHILDESIterator { private String[] words; private int index = 0; private BufferedReader br; private HashMap<String, Integer> enc = encoder; public CHILDESIterator() throws FileNotFoundException, IOException{ br = new BufferedReader(new FileReader(data)); words = br.readLine().split("[\\s]+"); } public boolean hasNext(){ return index < words.length; } public int next() throws IOException{ int next = enc.get(words[index++].toLowerCase()); while(index == words.length){ String line = br.readLine(); if(line == null){ return next; } else { words = line.trim().split("[\\s]+"); index = 0; } } return next; } public void close() throws IOException{ br.close(); } } public static void main(String[] args) throws IOException{ File data = new File("/Users/nicholasbartlett/Documents/np_bayes/Bayesian_Nonparametric_Ontology_Learning/data/_CHILDES.parsed.txt"); ProcessCHILDES pc = new ProcessCHILDES(data); //int[] d = pc.get(10); //System.out.println(Arrays.toString(d)); System.out.println(pc.size()); System.out.println(pc.dictionary().size()); /*for(int word : d){ System.out.println(pc.dictionary().get(word)); }*/ File f = new File("/Users/nicholasbartlett/Documents/np_bayes/Bayesian_Nonparametric_Ontology_Learning/data/token_counts.txt"); BufferedReader br = null; HashMap<String, MutableInt> countMap = new HashMap(); try{ br = new BufferedReader(new FileReader(f)); String line; while((line = br.readLine()) != null){ line = line.trim(); line = line.toLowerCase(); String[] words = line.split("[\\s]+"); String key = words[1]; int count = Integer.valueOf(words[0]).intValue(); MutableInt value = countMap.get(key); if(value == null){ countMap.put(key, new MutableInt(count)); } else { value.plusEquals(count); } } } finally { if (br != null) br.close(); } for(String key : countMap.keySet()){ Integer remove = pc.encoder.remove(key); if(remove == null){ System.out.println(key); } } System.out.println("---------------------------------------------------"); for(String key : pc.encoder.keySet()){ System.out.println(key); } System.out.println(countMap.size()); } }