import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Scanner;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
/**
 * Bag-of-words view of a single text file: maps each token found in the file
 * to the number of times it occurs.
 *
 * <p>How the file is tokenized is selected by the compile-time constant
 * {@link #TYPE}. Iterating over a {@code Document} yields each distinct token
 * once, in no particular order.
 */
public class Document implements Iterable<String> {
    /**
     * Available tokenization strategies.
     * {@code NORM} counts lower-cased, space-separated words of the body;
     * {@code POS} counts {@code word_TAG} pairs produced by the Stanford tagger.
     */
    private enum Rule {NORM, POS}

    /** Strategy used by every {@code Document} that gets constructed. */
    private static final Rule TYPE = Rule.NORM;

    /** Location of the tagger model loaded by {@link #buildPos(File)}. */
    private static final String TAGGER_MODEL = "pos_tagger/models/left3words-wsj-0-18.tagger";

    /**
     * word -> number of times the given word occurs in this document
     */
    private final Map<String, Integer> counts;

    /**
     * Reads the given file and counts its tokens using the strategy in {@link #TYPE}.
     *
     * <p>Read failures are reported on standard error and do not propagate; the
     * resulting document then contains whatever was counted before the failure
     * (possibly nothing).
     *
     * @param f file to read
     */
    public Document(File f){
        switch(TYPE){
        case NORM:
            this.counts = buildNorm(f);
            break;
        case POS:
            this.counts = buildPos(f);
            break;
        default:
            // Guards against a Rule value being added without a matching builder,
            // which would otherwise leave counts unset.
            throw new IllegalStateException("Unsupported tokenization rule: " + TYPE);
        }
    }

    /**
     * Counts the lower-cased words in the body of the file.
     *
     * <p>Every line up to and including the first blank line (blank after
     * trimming) is treated as a header and skipped. If the file contains no
     * blank line, the whole file is header and the result is empty. Body lines
     * are split on the single space character only, so words separated by other
     * whitespace inside a line (e.g. a tab) stay together as one token.
     *
     * @param f file to read
     * @return token -> occurrence count; empty if the file cannot be opened
     */
    private static Map<String, Integer> buildNorm(File f){
        Map<String, Integer> rtn = new HashMap<String, Integer>();
        Scanner s = null;
        try {
            s = new Scanner(f);
            boolean isHeader = true;
            while(s.hasNextLine()){
                String line = s.nextLine().trim();
                if(isHeader){
                    // The first blank line terminates the header; the blank line
                    // itself carries no tokens, so it is skipped as well.
                    if(line.length() == 0)
                        isHeader = false;
                    continue;
                }
                for(String t : line.split(" ")){
                    t = t.trim().toLowerCase(); //remove whitespace and lower-case word
                    if(t.length() != 0) //don't add empty strings
                        increment(rtn, t);
                }
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } finally {
            // Release the underlying file handle on every path.
            if(s != null)
                s.close();
        }
        return rtn;
    }

    /**
     * Counts part-of-speech tagged tokens in the file. Each key has the form
     * {@code word_TAG}, built from the tagger's output for every token of every
     * sentence. Unlike {@link #buildNorm(File)}, no header is skipped and no
     * case folding is applied.
     *
     * <p>Any exception raised while loading the model, reading, or tagging is
     * printed to standard error; the counts gathered up to that point are returned.
     *
     * @param f file to read
     * @return tagged token -> occurrence count
     */
    private static Map<String, Integer> buildPos(File f){
        Map<String, Integer> rtn = new HashMap<String, Integer>();
        BufferedReader reader = null;
        try{
            MaxentTagger tagger = new MaxentTagger(TAGGER_MODEL);
            reader = new BufferedReader(new FileReader(f));
            List<ArrayList<? extends HasWord>> sentences = MaxentTagger.tokenizeText(reader);
            for (ArrayList<? extends HasWord> sentence : sentences) {
                ArrayList<TaggedWord> tSentence = tagger.tagSentence(sentence);
                for(TaggedWord w : tSentence)
                    increment(rtn, w.word() + "_" + w.tag());
            }
        }catch(Exception e){
            e.printStackTrace();
        }finally{
            // Close the reader on every path; closing an already-closed reader is harmless.
            if(reader != null){
                try{
                    reader.close();
                }catch(java.io.IOException e){
                    e.printStackTrace();
                }
            }
        }
        return rtn;
    }

    /**
     * Adds one to the count stored for {@code key}, starting from zero when the
     * key is not present yet.
     */
    private static void increment(Map<String, Integer> tally, String key){
        Integer previous = tally.get(key);
        tally.put(key, previous == null ? 1 : previous + 1);
    }

    /**
     * @param token word to look up
     * @return number of times the given token occurs in this document;
     *         0 if the token does not occur
     */
    public Integer getCount(String token){
        Integer count = this.counts.get(token);
        return count == null ? 0 : count;
    }

    /**
     * Iterates over all of the unique words in the document
     */
    public Iterator<String> iterator(){
        return this.counts.keySet().iterator();
    }
}