/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.target.classifier;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Random;
import java.util.Scanner;
import java.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.SAXException;
import focusedCrawler.util.string.StopList;
import focusedCrawler.util.string.StopListFile;
import focusedCrawler.util.vsm.VSMElement;
import focusedCrawler.util.vsm.VSMElementComparator;
import focusedCrawler.util.vsm.VSMVector;
import weka.classifiers.functions.SMO;
import weka.classifiers.trees.RandomForest;
/**
* <p> </p>
*
* <p>Description: </p>
*
* <p>Copyright: Copyright (c) 2004</p>
*
* <p> </p>
*
* @author Luciano Barbosa
* @version 1.0
*/
public class WekaTargetClassifierBuilder {
private static Logger logger = LoggerFactory.getLogger(WekaTargetClassifierBuilder.class);
protected VSMVector[][] trainingExamples = null;
protected VSMVector[][] testExamples = null;
protected int numOfFeatures = Integer.MAX_VALUE;
protected int minDF = 5;
protected HashMap<String, VSMElement> df = new HashMap<>();
protected boolean isForm = false;
protected StopList stoplist;
public WekaTargetClassifierBuilder(File dir, File dirTest, StopList stoplist) throws SAXException, IOException {
this(dir,dirTest,stoplist,Integer.MAX_VALUE);
}
public WekaTargetClassifierBuilder(File input, File inputTest, StopList stoplist, int numOfElems) throws SAXException, IOException {
trainingExamples = new VSMVector[2][];
this.stoplist = stoplist;
if((new File (input + File.separator + "positive")).isDirectory()){
File[] positiveFiles = new File (input + File.separator + "positive").listFiles();
System.out.println("POSITIVE:" + positiveFiles.length);
File[] negativeFiles = new File (input + File.separator + "negative").listFiles();
System.out.println("NEGATIVE:" + negativeFiles.length);
int[] negIndexes = selectRandomNum(1,negativeFiles.length, numOfElems);
trainingExamples[1] = createVSM(negativeFiles, stoplist,negIndexes,true);
int[] posIndexes = selectRandomNum(1,positiveFiles.length, numOfElems);
trainingExamples[0] = createVSM(positiveFiles, stoplist,posIndexes,true);
}else{
trainingExamples[0] = createVSM(new File (input + File.separator + "positive"), stoplist);
trainingExamples[1] = createVSM(new File (input + File.separator + "negative"), stoplist);
}
if(inputTest != null){
testExamples = new VSMVector[2][];
if((new File (inputTest + File.separator + "positive")).isDirectory()){
File temp = new File (inputTest + File.separator + "positive");
System.out.println(temp.toString());
File[] positiveTestFiles = temp.listFiles();
trainingExamples[0] = createVSM(positiveTestFiles, stoplist,false);
File[] negativeTestFiles = new File (inputTest + File.separator + "negative").listFiles();
trainingExamples[1] = createVSM(negativeTestFiles, stoplist,false);
}else{
trainingExamples[0] = createVSM(new File (inputTest + File.separator + "positive"), stoplist);
trainingExamples[1] = createVSM(new File (inputTest + File.separator + "negative"), stoplist);
}
}
}
public WekaTargetClassifierBuilder(String[][] pages, StopList stoplist, int size) throws SAXException, IOException {
trainingExamples = new VSMVector[size][];
for (int i = 0; i < size; i++) {
String[] levelPages = pages[i];
trainingExamples[i] = createVSM(levelPages,stoplist);
}
}
private int[] selectRandomNum(long seed, int range, int elems){
if(elems > range){
elems = range;
}
int count = 0;
Random random = new Random(seed);
int next = random.nextInt(range);
HashSet<Integer> nums = new HashSet<>();
int[] result = new int[elems];
while(count < elems){
Integer num = new Integer(next);
if(!nums.contains(num)){
result[count] = next;
nums.add(num);
count++;
}
next = random.nextInt(range);
}
return result;
}
protected VSMVector[] createVSM(String[] pages, StopList stoplist) throws SAXException{
Vector<VSMVector> tempVSM = new Vector<VSMVector>();
for (int i = 0; i < pages.length; i++) {
try{
if(pages[i] == null){
continue;
}
VSMVector vsm = new VSMVector(pages[i],stoplist);
tempVSM.add(vsm);
Iterator<VSMElement> iterator1 = vsm.getElements();
while (iterator1.hasNext()) {
VSMElement elem = (VSMElement)iterator1.next();
VSMElement value = (VSMElement)df.get(elem.getWord());
if(value == null){
df.put(elem.getWord(), new VSMElement(elem.getWord(),1));
}else{
df.put(elem.getWord(), new VSMElement(elem.getWord(),value.getWeight() +1));
}
}
}catch(IOException ex){
ex.printStackTrace();
}
}
VSMVector[] examples = new VSMVector[tempVSM.size()];
tempVSM.toArray(examples);
return examples;
}
protected VSMVector[] createVSM(File file, StopList stoplist) throws SAXException{
Vector<VSMVector> tempVSM = new Vector<VSMVector>();
try{
BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file)));
for(String line = reader.readLine(); line != null; line = reader.readLine()){
VSMVector vsm = new VSMVector(line,stoplist);
tempVSM.add(vsm);
Iterator<VSMElement> iterator1 = vsm.getElements();
while (iterator1.hasNext()) {
VSMElement elem = (VSMElement)iterator1.next();
VSMElement value = (VSMElement)df.get(elem.getWord());
if(value == null){
df.put(elem.getWord(), new VSMElement(elem.getWord(),1));
}else{
df.put(elem.getWord(), new VSMElement(elem.getWord(),value.getWeight() +1));
}
}
}
reader.close();
}catch(IOException ex){
ex.printStackTrace();
}
VSMVector[] examples = new VSMVector[tempVSM.size()];
tempVSM.toArray(examples);
return examples;
}
protected VSMVector[] createVSM(File[] files, StopList stoplist, int[] indexes, boolean addToFeatures) throws SAXException{
Vector<VSMVector> tempVSM = new Vector<VSMVector>();
for (int i = 0; i < files.length && i < indexes.length; i++) {
try{
VSMVector vsm = new VSMVector(files[indexes[i]].toString(),isForm,stoplist);
tempVSM.add(vsm);
if(addToFeatures){
Iterator<VSMElement> iterator1 = vsm.getElements();
while (iterator1.hasNext()) {
VSMElement elem = (VSMElement)iterator1.next();
VSMElement value = (VSMElement)df.get(elem.getWord());
if(value == null){
df.put(elem.getWord(), new VSMElement(elem.getWord(),1));
}else{
df.put(elem.getWord(), new VSMElement(elem.getWord(),value.getWeight() +1));
}
}
}
}catch(IOException ex){
ex.printStackTrace();
}
}
VSMVector[] examples = new VSMVector[tempVSM.size()];
tempVSM.toArray(examples);
return examples;
}
protected VSMVector[] createVSM(File[] files, StopList stoplist, boolean addToFeatures) throws IOException, SAXException{
int[] indexes = new int[files.length];
for (int i = 0; i < indexes.length; i++) {
indexes[i] = i;
}
return createVSM(files, stoplist, indexes, addToFeatures);
}
Vector<String> attributes = new Vector<String>();
public String[] centroid2Weka(String output) throws FileNotFoundException,IOException {
OutputStream fout= new FileOutputStream(output,false);
OutputStream bout= new BufferedOutputStream(fout);
OutputStreamWriter outputFile = new OutputStreamWriter(bout);
StringBuffer header = new StringBuffer();
header.append("@RELATION TSFC");
header.append("\n");
header.append("\n");
StringBuffer tail = new StringBuffer();
Vector<VSMElement> bestWordsForm = new Vector<>(df.values());
Collections.sort(bestWordsForm, new VSMElementComparator());
for(int i=0; i<=numOfFeatures && i < bestWordsForm.size(); i++){
VSMElement elem = (VSMElement)bestWordsForm.elementAt(i);
if(elem.getWeight() > minDF){
header.append("@ATTRIBUTE ");
if(elem.getWord().equals("class")){
//This is a hack, weka does not allow attribute with name class.
elem.setWord("class-random-string");
}
header.append(elem.getWord());
attributes.add(elem.getWord());
header.append(" REAL");
header.append("\n");
}
}
header.append("@ATTRIBUTE class {");
for (int i = 0; i < trainingExamples.length-1; i++) {
header.append("CLASS_"+i+",");
}
header.append("CLASS_"+ (trainingExamples.length-1) +"}");
tail.append("\n");
tail.append("\n");
tail.append("@DATA");
tail.append("\n");
for (int l = 0; l < trainingExamples.length; l++) {
for (int i = 0; i < trainingExamples[l].length; i++) {
VSMVector formTemp = trainingExamples[l][i];
tail.append("{");
for (int j = 0; j < attributes.size(); j++) {
VSMElement elemForm = formTemp.getElement(attributes.elementAt(j));
if (elemForm != null){
tail.append(j);
tail.append(" ");
tail.append((int)elemForm.getWeight());
tail.append(",");
}
}
tail.append(attributes.size() + " CLASS_"+l+"}");
tail.append("\n");
}
}
outputFile.write(header.toString());
outputFile.flush();
outputFile.write(tail.toString());
outputFile.close();
if(testExamples != null){
createTestFile(output, bestWordsForm,header);
}
String[] atts = new String[attributes.size()];
attributes.toArray(atts);
return atts;
}
private void createTestFile(String output, Vector<VSMElement> bestWordsForm, StringBuffer header) throws
FileNotFoundException, IOException {
OutputStream fout= new FileOutputStream(output+"_test",false);
OutputStream bout= new BufferedOutputStream(fout);
OutputStreamWriter outputFile = new OutputStreamWriter(bout);
StringBuffer tail = new StringBuffer();
tail.append("\n");
tail.append("\n");
tail.append("@DATA");
tail.append("\n");
for (int l = 0; l < testExamples.length; l++) {
for (int i = 0; i < testExamples[l].length; i++) {
VSMVector examples = testExamples[l][i];
tail.append("{");
for (int j = 0; j < attributes.size(); j++) {
VSMElement elemForm = examples.getElement(attributes.elementAt(j));
if (elemForm != null){
tail.append(j);
tail.append(" ");
tail.append((int)elemForm.getWeight());
tail.append(",");
}
}
tail.append(attributes.size() + " CLASS_"+l+"}");
tail.append("\n");
}
}
outputFile.write(header.toString());
outputFile.flush();
outputFile.write(tail.toString());
outputFile.close();
}
public static void createInputFile(String stopwordsFile,
String trainingPath,
String wekaInputFile) {
try {
StopList stopwords = null;
File testDataPath = null;
File trainingDataPath = new File(trainingPath);
if(stopwordsFile != null && !stopwordsFile.isEmpty()) {
stopwords = new StopListFile(stopwordsFile);
} else {
stopwords = StopListFile.DEFAULT;
}
WekaTargetClassifierBuilder builder =
new WekaTargetClassifierBuilder(trainingDataPath, testDataPath, stopwords);
builder.centroid2Weka(wekaInputFile);
}
catch (SAXException | IOException ex1) {
throw new RuntimeException("Failed to generate weka input file.", ex1);
}
}
public static void trainModel(String trainingPath, String outputPath, String learner) {
if(learner==null) {
learner = "SMO";
}
System.out.println("Training "+ learner+" model...");
if(learner.equals("SMO")) {
SMO.main(new String[] {
"-M",
"-d", outputPath + "/pageclassifier.model",
"-t", trainingPath + "/weka.arff",
"-C", "0.01"
});
} else if(learner.equals("RandomForest")) {
RandomForest.main(new String[] {
// "-K", "5", // k-fold cross validation
"-I", "100", // Number of trees to build
"-d", outputPath + "/pageclassifier.model",
"-t", trainingPath + "/weka.arff"
});
} else {
System.out.println("Unknow learner: "+learner);
return;
}
}
public static void createFeaturesFile(String outputPath, String trainingPath) {
File features = new File(outputPath + File.separator + "pageclassifier.features");
try {
features.createNewFile();
FileWriter featuresWriter = new FileWriter(features);
//featuresWriter.write("");
featuresWriter.write("CLASS_VALUES S NS" + "\n" + "ATTRIBUTES");
String wekkaFilePath = trainingPath + "/weka.arff";
Scanner wekkaFileScanner = new Scanner(new File(wekkaFilePath));
while(wekkaFileScanner.hasNext()){
String nextLine = wekkaFileScanner.nextLine();
String[] splittedLine = nextLine.split(" ");
if(splittedLine.length>=3 && splittedLine[0].equals("@ATTRIBUTE") && splittedLine[2].equals("REAL"))
featuresWriter.write(" "+splittedLine[1]);
}
featuresWriter.write("\n");
wekkaFileScanner.close();
featuresWriter.flush();
featuresWriter.close();
} catch (IOException e) {
logger.error("IO Exception while creating wekka pageclassifier.features file. ",e);
}
}
}