package edu.usc.cssl.tacit.topicmodel.zlda.services; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import org.eclipse.core.runtime.OperationCanceledException; import org.eclipse.core.runtime.SubProgressMonitor; import edu.stanford.nlp.process.CoreLabelTokenFactory; import edu.stanford.nlp.process.PTBTokenizer; public class DTWC { private List<File> documents; private List<List<Integer>> docVectors; private int[][] docVectorsAsInt; private Map<String, Integer> termIndex; private Map<Integer, String> indexTerm; private Integer vocabSize; private File seedFile; private Map<String, Integer> termCount; private List<Map<Integer, List<Integer>>> topicSeeds; private int[][][] topicSeedsAsInt; private Map<Integer, Integer> seedWords; private SubProgressMonitor monitor; public Integer getVocabSize() { return vocabSize; } public Map<Integer, String> getIndexTerm(){ return indexTerm; } public File getSeedFile() { return seedFile; } public void setSeedFile(File seedFile) { this.seedFile = seedFile; } public List<Map<Integer, List<Integer>>> getTopicSeeds() { return topicSeeds; } public List<List<Integer>> getDocVectors(){ return docVectors; } public Map<String, Integer> getTermIndex() { return termIndex; } public int[][] getDocVectorsAsInt() { return docVectorsAsInt; } public int[][][] getTopicSeedsAsInt() { return topicSeedsAsInt; } private DTWC(List<File> docs, SubProgressMonitor monitor){ this.monitor = monitor; documents = docs; vocabSize = 0; docVectors = new ArrayList<List<Integer>>(); for(int i=0; i<docs.size(); i++){ docVectors.add(new ArrayList<Integer>()); } termIndex = new HashMap<String, Integer>(); termCount = new HashMap<String, Integer>(); initializeSeeds(); indexTerm = new HashMap<Integer, String>(); } public DTWC(List<File> docs, File seedFile, SubProgressMonitor monitor){ this(docs,monitor); this.seedFile = seedFile; } private void initializeSeeds(){ monitor.subTask("Initializing Seeds..."); topicSeeds = new ArrayList<Map<Integer, List<Integer>>>(); for(int i=0; i<docVectors.size(); i++){ topicSeeds.add(new HashMap<Integer, List<Integer>>()); } seedWords = new HashMap<Integer, Integer>(); } @SuppressWarnings({ "rawtypes"}) private void calculateTermIndicesAndVectors(){ String word; List<Integer> vector; for(int i=0; i<documents.size(); i++){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } vector = docVectors.get(i); try { PTBTokenizer ptbtk = new PTBTokenizer(new FileReader(documents.get(i)), new CoreLabelTokenFactory(), ""); monitor.subTask("Tokenizing words..."); while(ptbtk.hasNext()){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } //word = ptbtk.next().toString().toLowerCase(); word = ptbtk.next().toString(); word.replaceAll("[^a-zA-Z0-9-_]", " "); if(word.matches("^[a-zA-Z0-9]*$")){ //word = stemmer.stem(word); if(termIndex.containsKey(word) == false){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } termIndex.put(word, vocabSize); indexTerm.put(vocabSize, word); termCount.put(word, 1); vector.add(vocabSize); vocabSize++; } else{ if (monitor.isCanceled()) { throw new OperationCanceledException(); } vector.add(termIndex.get(word)); termCount.put(word, termCount.get(word) + 1); } } } } catch(Exception e){ e.printStackTrace(); } } } public void computeDocumentVectors(){ monitor.subTask("Calculating Term Indices and Vectors"); calculateTermIndicesAndVectors(); if (monitor.isCanceled()) { throw new OperationCanceledException(); } monitor.worked(15); computeZSets(); monitor.subTask("Converting to Primitive Data Types"); convertToPrimitiveDataTypes(); monitor.worked(20); } /* Computing z-sets for these documents */ /* Read the seed words from the seed file */ private void constructTopicList(){ String line; String[] words; int topicNo = 0; try { BufferedReader br = new BufferedReader(new FileReader(seedFile)); while((line = br.readLine()) != null){ words = line.split(" "); for(int i=0; i<words.length; i++){ if(termIndex.containsKey(words[i])){ seedWords.put(termIndex.get(words[i]), topicNo); } } topicNo++; } br.close(); } catch (Exception e) { e.printStackTrace(); } } public void computeZSets(){ monitor.subTask("Constructing Topic list..."); constructTopicList(); monitor.worked(15); monitor.subTask("Computing Z Sets Topic list..."); List<Integer> doc; Map<Integer, List<Integer>> docTopicSeeds; Integer word; List<Integer> wordTopicList; for(int i=0; i<docVectors.size(); i++){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } doc = docVectors.get(i); docTopicSeeds = topicSeeds.get(i); for(int j=0; j<doc.size(); j++){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } word = doc.get((int)j); if(termCount.get(indexTerm.get(word)) > 5) { wordTopicList = docTopicSeeds.get(j); if(seedWords.containsKey(word)){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } if(wordTopicList == null){ wordTopicList = new ArrayList<Integer>(); docTopicSeeds.put(j, wordTopicList); } wordTopicList.add(seedWords.get(word)); } } } } monitor.worked(15); } private void convertToPrimitiveDataTypes(){ docVectorsAsInt = new int[docVectors.size()][]; for(int i=0; i<docVectors.size(); i++){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } docVectorsAsInt[i] = new int[docVectors.get(i).size()]; for(int j=0; j<docVectors.get(i).size(); j++){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } docVectorsAsInt[i][j] = docVectors.get(i).get(j); } } topicSeedsAsInt = new int[topicSeeds.size()][][]; for(int i=0; i<topicSeeds.size(); i++){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } topicSeedsAsInt[i] = new int[docVectors.get(i).size()][]; for(int j=0; j<docVectors.get(i).size(); j++){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } if(topicSeeds.get(i).get(j) != null){ topicSeedsAsInt[i][j] = new int[topicSeeds.get(i).get(j).size()]; for(int k=0; k<topicSeeds.get(i).get(j).size(); k++){ if (monitor.isCanceled()) { throw new OperationCanceledException(); } topicSeedsAsInt[i][j][k] = topicSeeds.get(i).get(j).get(k); } } } } } }