package com.manning.hsia.dvdstore.util;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
/**
 * Command-line utility that counts word occurrences in the DocBook chapter
 * files {@code ch01/ch01.xml} .. {@code ch10/ch10.xml} and writes the result
 * to {@code index.csv} in the same base directory.
 *
 * Each output line has the form
 * {@code word;total;countInChapter1;...;countInChapter10}; lines are ordered
 * by descending total count. Words are compared case-insensitively and words
 * found in {@link #exclusionList} are ignored.
 */
public class ChapterWordcount {

    /** Number of chapters scanned; also the number of per-chapter columns written. */
    private static final int CHAPTER_COUNT = 10;

    /** Base directory used when none is given on the command line. */
    private static final String DEFAULT_BASE_DIR = "/Users/manu/Documents/book/docbook/";

    /** Characters that separate two tokens on a line (whitespace, XML and punctuation marks). */
    private static final String TOKEN_DELIMITERS = " <:>,./\"()?!;*'";

    /**
     * Words (common English words and DocBook element names) that are not counted.
     *
     * Every entry is stored as written and additionally in its normalized
     * (lower-case) form, because tokens are normalized before the lookup;
     * without the normalized form, entries such as "I" or "While" would
     * never match anything.
     */
    static final Set<String> exclusionList;
    static {
        final String[] entries = {
            "a", "the", "para", "is", "of", "and", "to", "in", "listitem", "title",
            "indexterm", "primary", "be", "will", "on", "for", "by", "not", "are",
            "this", "section", "or", "The",
            "it", "don",
            "is", "me", "s",
            "from", "t", "id", "long",
            "that", "don'", "her", "puts",
            "an", "additional", "was", "important",
            "have", "They", "comes", "able",
            "with", "easy", "provided", "let's",
            "as", "How", "To", "different",
            "you", "several", "probably", "typical",
            "more", "possible", "three", "no", "understand",
            "book", "which", "because", "need",
            "itemizelist", "see", "fairly", "depending",
            "figure", "Unfortunately", "unfortunately", "higher",
            "very", "loading", "both", "It",
            "when", "much", "end", "go", "some",
            "mediaobject", "make", "want", "given",
            "they", "last", "its", "them",
            "imageobject", "other", "first", "two",
            "This", "could", "those", "between",
            "into", "example", "In", "quite",
            "has", "such", "lot", "using",
            "your", "up", "these", "chapter",
            "what", "would", "does", "do",
            "each", "use", "so", "common",
            "most", "know", "if", "provide",
            "very", "also", "emphasis", "only",
            "their", "than", "used", "fileref",
            "but", "imagedata", "full", "png",
            "might", "I", "about", "through",
            "way", "xref", "needs", "While",
            "how", "all"
        };
        Set<String> excludeList = new HashSet<String>();
        for (String entry : entries) {
            excludeList.add(entry);
            excludeList.add(normalize(entry));
        }
        exclusionList = Collections.unmodifiableSet(excludeList);
    }

    /**
     * Counts the words of all chapters and writes the CSV report.
     *
     * @param args optional; {@code args[0]} is the base directory holding the
     *             {@code chNN/chNN.xml} files and receiving {@code index.csv}.
     *             When absent, the built-in default directory is used.
     * @throws IOException if the report file cannot be created or written
     */
    public static void main(String[] args) throws IOException {
        final File baseDir = new File(
                args != null && args.length > 0 ? args[0] : DEFAULT_BASE_DIR);
        final Map<String, Word> words = new HashMap<String, Word>(1000);
        for (int chapter = 1; chapter <= CHAPTER_COUNT; chapter++) {
            chapterWordCount(words, chapter, baseDir);
        }

        final List<String> wordList = new ArrayList<String>(words.keySet());
        // Most frequent word first. Explicit comparison instead of subtraction
        // so the result can never overflow.
        Collections.sort(wordList, new Comparator<String>() {
            public int compare(String left, String right) {
                int leftCount = words.get(left).count;
                int rightCount = words.get(right).count;
                if (leftCount == rightCount) {
                    return 0;
                }
                return leftCount > rightCount ? -1 : 1;
            }
        });

        File export = new File(baseDir, "index.csv");
        BufferedWriter writer = new BufferedWriter(new FileWriter(export));
        try {
            for (String entry : wordList) {
                Word word = words.get(entry);
                StringBuilder builder = new StringBuilder();
                builder.append(entry).append(";").append(word.count);
                for (int index = 0; index < CHAPTER_COUNT; index++) {
                    builder.append(";").append(word.chapters[index]);
                }
                writer.write(builder.toString());
                writer.newLine();
            }
        } finally {
            // Close even when a write fails so the file handle is not leaked.
            writer.close();
        }
    }

    /**
     * Tokenizes one chapter file and adds its words to the global statistics.
     * A chapter that cannot be read is reported on standard error and skipped;
     * the remaining chapters are still processed.
     *
     * @param globalWords statistics shared by all chapters, updated in place
     * @param chapter     1-based chapter number
     * @param baseDir     directory containing the {@code chNN} sub-directories
     */
    private static void chapterWordCount(Map<String, Word> globalWords, int chapter, File baseDir) {
        // Chapter directories and files use a two-digit, zero-padded number.
        final String number = chapter < 10 ? "0" + chapter : String.valueOf(chapter);
        final String name = "ch" + number + "/ch" + number + ".xml";
        int wordCount = 0;
        try {
            BufferedReader reader = new BufferedReader(new FileReader(new File(baseDir, name)));
            try {
                for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                    StringTokenizer tokenizer = new StringTokenizer(line, TOKEN_DELIMITERS, false);
                    while (tokenizer.hasMoreTokens()) {
                        // "Raw" count: every token, including excluded words.
                        wordCount++;
                        addToMap(tokenizer.nextToken(), globalWords, chapter);
                    }
                }
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            System.err.println("Unable to read chapter file " + name);
            e.printStackTrace();
        }
        System.out.println("Chapter " + name + " raw word count=" + wordCount);
    }

    /**
     * Records one occurrence of a token for the given chapter unless the
     * normalized token is part of {@link #exclusionList}.
     *
     * @param rawNextToken token as read from the file
     * @param words        statistics to update, keyed by normalized token
     * @param chapter      1-based chapter number the token was found in
     */
    private static void addToMap(String rawNextToken, Map<String, Word> words, int chapter) {
        String nextToken = normalize(rawNextToken);
        if (exclusionList.contains(nextToken)) {
            return;
        }
        Word word = words.get(nextToken);
        if (word == null) {
            word = new Word(CHAPTER_COUNT);
            words.put(nextToken, word);
        }
        word.count++;
        word.chapters[chapter - 1]++;
    }

    /**
     * Lower-cases a word with a fixed locale so that the result does not
     * depend on the default locale of the machine running the tool.
     */
    private static String normalize(String word) {
        return word.toLowerCase(Locale.ENGLISH);
    }

    /** Occurrence counters of a single word. */
    private static class Word {
        /** Total number of occurrences over all chapters. */
        int count;
        /** Occurrences per chapter; index 0 holds chapter 1. */
        final int[] chapters;

        Word(int chapterCount) {
            this.chapters = new int[chapterCount];
        }
    }
}