/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.cloudera.knittingboar.records;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
import org.apache.mahout.vectorizer.encoders.ConstantValueEncoder;
import org.apache.mahout.vectorizer.encoders.Dictionary;
import org.apache.mahout.vectorizer.encoders.FeatureVectorEncoder;
import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;
import com.cloudera.knittingboar.utils.DataUtils;
import com.google.common.base.Splitter;
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Iterables;
import com.google.common.collect.Multiset;
import junit.framework.TestCase;
public class Test20NewsgroupsBookParsing extends TestCase {

  private static Path workDir20NewsLocal = new Path(new Path("/tmp"), "Dataset20Newsgroups");

  private static File unzipDir = new File( workDir20NewsLocal + "/20news-bydate");

  private static String strKBoarTestDirInput = "" + unzipDir.toString() + "/KBoar-test/";

  private static final int FEATURES = 10000;

  // Running tally of every token seen across messages. Currently only
  // allocated; the accumulation call in countWords is commented out.
  private static Multiset<String> overallCounts;

  /**
   * Tokenizes the given character stream with the supplied analyzer and adds
   * every emitted token to {@code words} (a {@link Multiset} keeps counts).
   *
   * @param analyzer Lucene analyzer used to tokenize the stream
   * @param words collection that receives each token
   * @param in character stream to tokenize; consumed but not closed here
   * @throws IOException if tokenization fails
   */
  private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {

    // use the provided analyzer to tokenize the input stream
    TokenStream ts = analyzer.tokenStream("text", in);
    // keep the attribute reference instead of re-fetching it on every token
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    try {
      // for each word in the stream, minus non-word stuff, add word to collection
      while (ts.incrementToken()) {
        words.add(termAtt.toString());
      }
      ts.end();
    } finally {
      // release analyzer/stream resources even if tokenization throws
      ts.close();
    }
    /*overallCounts.addAll(words);*/
  }

  /**
   * Scans the 20-newsgroups training corpus, tokenizing the useful header
   * fields and the body of each message exactly the way the OLR training
   * walkthrough (Mahout in Action, p.269-270) does. This exercises only the
   * Lucene parsing mechanics; no model is trained and nothing is asserted
   * about the token content.
   *
   * @throws IOException if the corpus cannot be read
   */
  public void test20NewsgroupsFileScan() throws IOException {

    // p.270 ----- metrics tracked while scanning --------------------------
    double averageLineCount = 0.0;
    int k = 0;            // number of messages processed so far
    double lineCount = 0; // "Lines:" header value of the current message

    Splitter onColon = Splitter.on(":").trimResults();

    // last line on p.269
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);

    // ensure the corpus is downloaded and unpacked locally (side effect only)
    DataUtils.getTwentyNewsGroupDir();
    System.out.println( "written to: " + DataUtils.get20NewsgroupsLocalDataLocation() );

    File base = new File(DataUtils.get20NewsgroupsLocalDataLocation() + "/20news-bydate-train");

    overallCounts = HashMultiset.create();

    // p.269 ---------------------------------------------------------
    Map<String, Set<Integer>> traceDictionary = new TreeMap<String, Set<Integer>>();

    // encodes the text content in both the subject and the body of the email
    FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
    encoder.setProbes(2);
    encoder.setTraceDictionary(traceDictionary);

    // provides a constant offset that the model can use to encode the average
    // frequency of each class
    FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    bias.setTraceDictionary(traceDictionary);

    // used to encode the number of lines in a message
    FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
    lines.setTraceDictionary(traceDictionary);

    // bottom of p.269: OLR expects integer class IDs for the target variable,
    // so this dictionary maps each newsgroup name to an int
    Dictionary newsGroups = new Dictionary();

    File[] groups = base.listFiles();
    // fail with a message (rather than an NPE) if the training dir is missing
    assertNotNull("training directory not found: " + base, groups);

    List<File> files = new ArrayList<File>();
    for (File newsgroup : groups) {
      // skip stray non-directory entries; listFiles() would return null below
      if (!newsgroup.isDirectory()) {
        continue;
      }
      newsGroups.intern(newsgroup.getName());
      System.out.println( ">> " + newsgroup.getName() );
      files.addAll(Arrays.asList(newsgroup.listFiles()));
    }

    // mix up the files, helps training in OLR
    Collections.shuffle(files);
    System.out.printf("%d training files\n", files.size());

    // ----- p.270 ------------ "reading and tokenzing the data" ---------
    for (File file : files) {

      BufferedReader reader = new BufferedReader(new FileReader(file));
      try {
        // the parent directory name is the newsgroup; intern it so it gets a
        // unique integer id in the target dictionary
        String ng = file.getParentFile().getName();
        newsGroups.intern(ng);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // ---- header section: runs until the first blank line ------------
        String line = reader.readLine();
        while (line != null && line.length() > 0) {

          // if this is the line-count header, pull that value out ----------
          if (line.startsWith("Lines:")) {
            String count = Iterables.get(onColon.split(line), 1);
            try {
              lineCount = Integer.parseInt(count);
              // online running mean, windowed at 1000 samples
              averageLineCount += (lineCount - averageLineCount) / Math.min(k + 1, 1000);
            } catch (NumberFormatException e) {
              // if anything goes wrong in parse: just use the avg count
              lineCount = averageLineCount;
            }
          }

          // only these header fields carry words worth counting
          boolean countHeader = (
              line.startsWith("From:") || line.startsWith("Subject:") ||
              line.startsWith("Keywords:") || line.startsWith("Summary:"));

          // consume this header line plus any folded continuation lines
          // (continuations start with a space)
          do {
            // get a reader for this specific string ------
            StringReader in = new StringReader(line);
            // ---- count words in header ---------
            if (countHeader) {
              countWords(analyzer, words, in);
            }
            // iterate to the next line; null-guard because the file may end
            // in the middle of the header
            line = reader.readLine();
          } while (line != null && line.startsWith(" "));
        }

        // now we're done with the header
        // -------- count words in body ----------
        countWords(analyzer, words, reader);

        // message fully processed: advance k so the running average above
        // actually averages instead of tracking the last value
        k++;
        /*
        for (String word : words.elementSet()) {
          //encoder.addToVector(word, Math.log(1 + words.count(word)), v);
          System.out.println( "> " + word + ", " + words.count(word) );
        }
        */
      } finally {
        // close the file even if parsing throws (original leaked on error)
        reader.close();
      }
    }
  }
}