package focusedCrawler.link.classifier.builder;
import java.io.BufferedOutputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.Vector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import focusedCrawler.link.BipartiteGraphRepository;
import focusedCrawler.link.classifier.LinkClassifier;
import focusedCrawler.link.classifier.LinkClassifierFactory;
import focusedCrawler.link.frontier.Frontier;
import focusedCrawler.util.parser.LinkNeighborhood;
import focusedCrawler.util.parser.PaginaURL;
import focusedCrawler.util.persistence.Tuple;
import focusedCrawler.util.string.PorterStemmer;
import focusedCrawler.util.string.StopList;
import focusedCrawler.util.vsm.VSMElement;
import focusedCrawler.util.vsm.VSMElementComparator;
import weka.classifiers.Classifier;
import weka.core.Instances;
import weka.core.SerializationHelper;
public class LinkClassifierBuilder {
// Class-wide SLF4J logger.
private static final Logger logger = LoggerFactory.getLogger(LinkClassifierBuilder.class);
// Repository queried for link neighborhoods, backlink neighborhoods and the hub graph.
private BipartiteGraphRepository graphRep;
// Receives the selected per-field words (setFeatures) and extracts feature values (extractLinks).
private LinkNeighborhoodWrapper wrapper;
// Used to discard irrelevant words during feature selection; also handed to the URL tokenizer.
private StopList stoplist;
// Stems anchor and around words before they are counted; created in the constructor.
private PorterStemmer stemmer;
// Source of the visited links from which forward-link training instances are loaded.
private Frontier frontier;
// Feature names chosen by the most recent createWekaInput call (mutable state shared by the training methods).
private String[] features;
// Folder "<dataPath>/link_classifier" where the model and features files are written.
private Path linkClassifierFolder;
public LinkClassifierBuilder(String dataPath, BipartiteGraphRepository graphRep, StopList stoplist, LinkNeighborhoodWrapper wrapper, Frontier frontier){
this.graphRep = graphRep;
this.stemmer = new PorterStemmer();
this.stoplist = stoplist;
this.wrapper = wrapper;
this.frontier = frontier;
this.linkClassifierFolder = Paths.get(dataPath, "/link_classifier/");
if (!Files.exists(linkClassifierFolder)) {
try {
Files.createDirectories(linkClassifierFolder);
} catch (IOException e) {
throw new RuntimeException("Failed to create link classifier folder: "
+ linkClassifierFolder.toString(), e);
}
}
}
/**
 * Trains a forward-link classifier from the visited links, stores the model and
 * the selected feature names in the classifier folder, and returns a ready-to-use
 * {@link LinkClassifier}.
 *
 * @param relSites  URLs considered relevant
 * @param levels    {@code 0} for a POS/NEG classifier; otherwise the number of backlink levels
 * @param className classifier implementation name passed on to {@link LinkClassifierFactory}
 * @return the classifier created by {@link LinkClassifierFactory}
 * @throws Exception if loading instances, training or persisting the model fails
 */
public synchronized LinkClassifier forwardlinkTraining(HashSet<String> relSites, int levels, String className) throws Exception{
    // createWekaInput also (re)assigns this.features, so it has to run before
    // the features are written out or passed to the factory.
    String arffText = createWekaInput(loadTrainingInstances(relSites, levels), false);

    logger.info("Training new link classifier...");
    Classifier trained = trainWekaClassifier(arffText);

    String modelFile = linkClassifierFolder.resolve("link_classifier.model").toString();
    String featuresFile = linkClassifierFolder.resolve("link_classifier.features").toString();
    logger.info("Link Clasifier model file: "+modelFile);
    logger.info("Link Clasifier features file: "+featuresFile);

    SerializationHelper.write(modelFile, trained);
    writeFeaturesFile(featuresFile, features);

    // NOTE(review): the multi-level label set is fixed at three entries
    // regardless of the value of levels - confirm this is intended.
    String[] classValues = (levels == 0)
            ? new String[] {"POS", "NEG"}
            : new String[] {"0", "1", "2"};

    return LinkClassifierFactory.createLinkClassifierImpl(
            features, classValues, trained, className, levels);
}
/**
 * Parses the given Weka input text and trains an SMO classifier on it.
 * The last attribute of the data set is used as the class attribute, and the
 * classifier is configured from a fixed option string (PolyKernel with
 * {@code -E 1.0}).
 *
 * @param wekaInputAsString the training data as text readable by {@link Instances}
 * @return the trained classifier
 * @throws Exception if the input cannot be parsed or training fails
 */
public Classifier trainWekaClassifier(String wekaInputAsString) throws Exception {
    Instances trainingData;
    try (StringReader arffReader = new StringReader(wekaInputAsString)) {
        trainingData = new Instances(arffReader);
    }
    int classColumn = trainingData.numAttributes() - 1;
    trainingData.setClassIndex(classColumn);

    String[] smoOptions = weka.core.Utils.splitOptions("-C 1.0 -L 0.0010 -P 1.0E-12 -N 0 -M -V -1 -W 1 -K \"weka.classifiers.functions.supportVector.PolyKernel -C 250007 -E 1.0\" -no-cv");
    Classifier svm = new weka.classifiers.functions.SMO();
    svm.setOptions(smoOptions);
    svm.buildClassifier(trainingData);
    return svm;
}
/**
 * Writes the feature names to the given file, each followed by a single space.
 * An existing file is overwritten. The text is encoded with the platform
 * default charset, as before.
 *
 * @param featuresFile path of the file to (over)write
 * @param features     feature names to store, in order
 * @throws FileNotFoundException if the file cannot be opened for writing
 * @throws IOException           if writing or closing the file fails
 */
private void writeFeaturesFile(String featuresFile, String[] features)
        throws FileNotFoundException, IOException {
    // try-with-resources guarantees the file handle is released even when a
    // write fails part-way through; previously the streams leaked in that case.
    try (OutputStream fout = new FileOutputStream(featuresFile, false);
         OutputStream bout = new BufferedOutputStream(fout);
         OutputStreamWriter outputFile = new OutputStreamWriter(bout)) {
        for (String feature : features) {
            outputFile.write(feature + " ");
        }
    }
}
/**
 * Groups the link neighborhoods of all visited links into training buckets.
 * <p>
 * With {@code levels == 0} two buckets are produced: index 0 holds neighborhoods
 * whose visited URL is in {@code relSites}, index 1 holds the others, which are
 * only accepted while that bucket is smaller than bucket 0. Otherwise
 * {@code levels} buckets are produced: relevant links go to bucket 0 and their
 * backlinks are distributed over the following buckets by {@code addBacklinks}.
 *
 * @param relSites URLs considered relevant
 * @param levels   {@code 0} for the POS/NEG case, otherwise the number of buckets
 * @return one vector of link neighborhoods per bucket
 */
private Vector<Vector<LinkNeighborhood>> loadTrainingInstances(
        HashSet<String> relSites, int levels)
        throws Exception,
               MalformedURLException,
               IOException {
    final boolean posNegCase = (levels == 0);
    final int bucketCount = posNegCase ? 2 : levels;

    Vector<Vector<LinkNeighborhood>> buckets = new Vector<>(bucketCount);
    for (int b = 0; b < bucketCount; b++) {
        buckets.add(new Vector<LinkNeighborhood>());
    }

    for (String visited : frontier.visitedLinks()) {
        URL visitedUrl = new URL(visited);
        LinkNeighborhood neighborhood = graphRep.getLN(visitedUrl);
        if (neighborhood == null) {
            continue;
        }

        if (posNegCase) {
            Vector<LinkNeighborhood> positives = buckets.elementAt(0);
            Vector<LinkNeighborhood> negatives = buckets.elementAt(1);
            if (relSites.contains(visitedUrl.toString())) {
                positives.add(neighborhood);
            } else if (negatives.size() < positives.size()) {
                // Negatives are only taken while they are outnumbered by positives.
                negatives.add(neighborhood);
            }
        } else if (relSites.contains(neighborhood.getLink().toString())) {
            buckets.elementAt(0).add(neighborhood);
            addBacklinks(buckets, neighborhood.getLink(), 1, levels, relSites);
        }
    }
    return buckets;
}
/**
 * Recursively adds the backlinks of {@code url} to the bucket for {@code level},
 * then descends into each backlink with {@code level + 1} until {@code limit}
 * is reached. Backlinks whose URL is in {@code relSites} are not added, but are
 * still followed.
 *
 * @param instances buckets to fill, indexed by level
 * @param url       URL whose backlinks are looked up
 * @param level     bucket index for the backlinks of {@code url}
 * @param limit     first level that is no longer processed
 * @param relSites  URLs considered relevant
 * @throws IOException if the backlink lookup fails
 */
private void addBacklinks(Vector<Vector<LinkNeighborhood>> instances, URL url, int level, int limit, HashSet<String> relSites) throws IOException{
    if (level < limit) {
        for (LinkNeighborhood backlink : graphRep.getBacklinksLN(url)) {
            URL sourceUrl = backlink.getLink();
            boolean alreadyRelevant = relSites.contains(sourceUrl.toString());
            if (!alreadyRelevant) {
                instances.elementAt(level).add(backlink);
            }
            addBacklinks(instances, sourceUrl, level + 1, limit, relSites);
        }
    }
}
/**
 * Trains a POS/NEG classifier for hub (backlink) pages.
 * <p>
 * Every hub with more than 20 outlinks is scored with the average weight of
 * its outlinks, looked up in {@code outlinkWeights} under the key
 * {@code "<outlink>_auth"} (missing entries count as zero). The scored hubs are
 * sorted with {@link VSMElementComparator}; the head of the sorted list becomes
 * the positive sample and the tail the negative sample, each holding at most
 * 5000 entries and at most half of the candidates.
 *
 * @param outlinkWeights weights keyed by {@code "<outlink>_auth"}
 * @return the classifier created by {@link LinkClassifierFactory} with class values POS/NEG
 * @throws Exception if repository access, URL parsing or training fails
 */
public LinkClassifier backlinkTraining(HashMap<String,VSMElement> outlinkWeights) throws Exception{
    Vector<VSMElement> trainingSet = new Vector<VSMElement>();
    Tuple<String>[] tuples = graphRep.getHubGraph();
    for (Tuple<String> tuple : tuples) {
        String hubId = tuple.getKey();
        String[] outlinks = tuple.getValue().split("###");
        double totalProb = 0;
        for (String outlink : outlinks) {
            VSMElement elem = outlinkWeights.get(outlink + "_auth");
            if (elem != null) {
                totalProb = totalProb + elem.getWeight();
            }
        }
        String url = graphRep.getHubURL(hubId);
        if (url != null && outlinks.length > 20) {
            LinkNeighborhood ln = graphRep.getBacklinkLN(new URL(url));
            if (ln != null) {
                // The neighborhood is carried through the sort as "<url>:::<around words>".
                VSMElement elem = new VSMElement(
                        ln.getLink().toString() + ":::" + ln.getAroundString(),
                        totalProb / outlinks.length);
                trainingSet.add(elem);
            }
        }
    }
    // Use the class logger instead of writing diagnostics to stdout.
    logger.info("TOTAL TRAINING:{}", trainingSet.size());

    Collections.sort(trainingSet, new VSMElementComparator());

    // Rebuild the link neighborhoods from their "<url>:::<around words>" form.
    Vector<LinkNeighborhood> allLNs = new Vector<LinkNeighborhood>();
    for (int i = 0; i < trainingSet.size(); i++) {
        String[] parts = trainingSet.elementAt(i).getWord().split(":::");
        LinkNeighborhood ln = new LinkNeighborhood(new URL(parts[0]));
        if (parts.length > 1) {
            StringTokenizer tokenizer = new StringTokenizer(parts[1], " ");
            Vector<String> aroundTemp = new Vector<String>();
            while (tokenizer.hasMoreTokens()) {
                aroundTemp.add(tokenizer.nextToken());
            }
            String[] aroundArray = new String[aroundTemp.size()];
            aroundTemp.toArray(aroundArray);
            ln.setAround(aroundArray);
        }
        allLNs.add(ln);
    }

    Vector<LinkNeighborhood> posSites = new Vector<LinkNeighborhood>();
    Vector<LinkNeighborhood> negSites = new Vector<LinkNeighborhood>();
    Vector<Vector<LinkNeighborhood>> instances = new Vector<Vector<LinkNeighborhood>>(2);
    instances.add(posSites);
    instances.add(negSites);

    // sampleSize never exceeds half of the candidates, so the two samples cannot overlap.
    int total = allLNs.size();
    int sampleSize = Math.min(5000, total / 2);
    for (int i = 0; i < sampleSize; i++) {
        posSites.add(allLNs.elementAt(i));
    }
    for (int i = total - 1; i >= total - sampleSize; i--) {
        negSites.add(allLNs.elementAt(i));
    }

    String wekaInput = createWekaInput(instances, true);
    Classifier classifier = trainWekaClassifier(wekaInput);
    String[] classValues = new String[]{"POS","NEG"};
    return LinkClassifierFactory.createLinkClassifierImpl(features, classValues, classifier, "LinkClassifierHub",0);
}
/**
 * Builds the complete Weka input text for the given training instances.
 * <p>
 * As a side effect the {@code features} field is replaced by the features
 * selected for these instances. The output declares one {@code REAL} attribute
 * per feature, a nominal class attribute with the labels
 * {@code 1..instances.size()}, and a data section with one line per instance.
 *
 * @param instances training instances, one vector per class
 * @param backlink  {@code true} when building input for the backlink classifier
 *                  (passed through to feature selection)
 * @return the Weka input as a single string
 * @throws IOException if feature selection or line generation fails
 */
private String createWekaInput(Vector<Vector<LinkNeighborhood>> instances, boolean backlink)
        throws IOException {
    Vector<LinkNeighborhood> flattened = new Vector<LinkNeighborhood>();
    for (Vector<LinkNeighborhood> group : instances) {
        flattened.addAll(group);
    }
    features = selectBestFeatures(flattened, backlink);

    StringBuilder arff = new StringBuilder("@relation classifier\n");
    for (String feature : features) {
        arff.append("@attribute ").append(feature).append(" REAL \n");
    }

    int classCount = instances.size();
    arff.append("@attribute class {");
    for (int label = 1; label < classCount; label++) {
        arff.append(label).append(",");
    }
    arff.append(classCount).append("}\n");
    arff.append("\n");
    arff.append("@data\n");
    arff.append(generatLines(features, instances));
    return arff.toString();
}
/**
 * Creates the data lines of the Weka input, one per extracted instance.
 * <p>
 * Each line has the sparse form {@code {index value,...,classIndex label}}:
 * only feature values greater than zero are listed (truncated to {@code int}),
 * followed by the class attribute at index {@code values.length} whose label is
 * the 1-based position of the instance's vector in {@code instances}.
 * Instances with no positive feature value are skipped.
 *
 * @param features  feature names handed to the wrapper for extraction
 * @param instances training instances, one vector per class
 * @return the concatenated data lines, each terminated by {@code \n}
 * @throws IOException if feature extraction fails
 */
private String generatLines(String[] features, Vector<Vector<LinkNeighborhood>> instances)
        throws IOException {
    StringBuilder buffer = new StringBuilder();
    for (int i = 0; i < instances.size(); i++) {
        Vector<LinkNeighborhood> level = instances.elementAt(i);
        // Logged through the class logger instead of a bare stdout print.
        logger.info("Training instances for class {}: {}", i + 1, level.size());
        for (LinkNeighborhood ln : level) {
            HashMap<String, Instance> featureValue = wrapper.extractLinks(ln, features);
            for (Instance instance : featureValue.values()) {
                double[] values = instance.getValues();
                // A fresh buffer per instance: reusing one buffer for all entries of
                // the same neighborhood caused already-emitted lines to be emitted again.
                StringBuilder line = new StringBuilder("{");
                boolean containsValue = false;
                for (int l = 0; l < values.length; l++) {
                    if (values[l] > 0) {
                        containsValue = true;
                        // NOTE(review): the cast truncates, so values in (0, 1) are written as 0.
                        line.append(l).append(" ").append((int) values[l]).append(",");
                    }
                }
                line.append(values.length).append(" ").append(i + 1);
                line.append("}\n");
                if (containsValue) {
                    buffer.append(line);
                }
            }
        }
    }
    return buffer.toString();
}
/**
 * Selects the words used as classifier features.
 * <p>
 * Word frequencies are counted separately for anchor text, surrounding text and
 * URL tokens (each distinct link URL is tokenized only once). Anchor and around
 * words are stemmed; URL tokens are not. Words rejected by the stop list are
 * ignored. Each field's words are sorted with {@code WordFrequencyComparator}
 * and reduced with {@code FilterData}; the survivors are registered on the
 * wrapper per field and returned with the prefixes {@code around_},
 * {@code url_} and {@code anchor_}, in that order. Anchor features are only
 * produced when {@code backlink} is {@code false}.
 *
 * @param allNeighbors all link neighborhoods of the training set
 * @param backlink     {@code true} to skip anchor features
 * @return the prefixed feature names
 * @throws MalformedURLException if the base URL for URL tokenization cannot be built
 */
private String[] selectBestFeatures(Vector<LinkNeighborhood> allNeighbors, boolean backlink) throws MalformedURLException{
    Map<String, WordFrequency> anchorCounts = new HashMap<>();
    Map<String, WordFrequency> aroundCounts = new HashMap<>();
    Map<String, WordFrequency> urlCounts = new HashMap<>();
    Set<String> seenLinks = new HashSet<>();

    for (LinkNeighborhood neighbor : allNeighbors) {
        // anchor text
        for (String token : neighbor.getAnchor()) {
            String stem = stemmer.stem(token);
            if (stem != null && !stoplist.isIrrelevant(stem)) {
                bumpFrequency(anchorCounts, stem);
            }
        }
        // surrounding text
        for (String token : neighbor.getAround()) {
            String stem = stemmer.stem(token);
            if (stem != null && !stoplist.isIrrelevant(stem)) {
                bumpFrequency(aroundCounts, stem);
            }
        }
        // URL tokens, counted once per distinct link
        if (seenLinks.add(neighbor.getLink().toString())) {
            PaginaURL pageParser = new PaginaURL(new URL("http://"), neighbor.getLink().getFile().toString(), stoplist);
            for (String token : pageParser.palavras()) {
                if (!stoplist.isIrrelevant(token)) {
                    bumpFrequency(urlCounts, token);
                }
            }
        }
    }

    Vector<String> featureNames = new Vector<>();
    String[][] fieldWords = new String[WordField.FIELD_NAMES.length][];

    Vector<WordFrequency> aroundRanked = new Vector<>(aroundCounts.values());
    Collections.sort(aroundRanked, new WordFrequencyComparator());
    Vector<WordFrequency> aroundSelected = new FilterData(100, 2).filter(aroundRanked, null);
    fieldWords[WordField.AROUND] = collectPrefixed(aroundSelected, "around_", featureNames);

    Vector<WordFrequency> urlRanked = new Vector<>(urlCounts.values());
    Collections.sort(urlRanked, new WordFrequencyComparator());
    // The URL filter is given a copy of the selected around words as its second argument.
    @SuppressWarnings("unchecked")
    Vector<WordFrequency> urlSelected = new FilterData(150, 2).filter(urlRanked, (Vector<WordFrequency>) aroundSelected.clone());
    fieldWords[WordField.URLFIELD] = collectPrefixed(urlSelected, "url_", featureNames);

    if (!backlink) {
        Vector<WordFrequency> anchorRanked = new Vector<>(anchorCounts.values());
        Collections.sort(anchorRanked, new WordFrequencyComparator());
        Vector<WordFrequency> anchorSelected = new FilterData(150, 2).filter(anchorRanked, null);
        fieldWords[WordField.ANCHOR] = collectPrefixed(anchorSelected, "anchor_", featureNames);
    }

    wrapper.setFeatures(fieldWords);
    return featureNames.toArray(new String[featureNames.size()]);
}

/** Stores a new WordFrequency for {@code word} with its previous count plus one (or one if unseen). */
private static void bumpFrequency(Map<String, WordFrequency> counts, String word) {
    WordFrequency previous = counts.get(word);
    if (previous == null) {
        counts.put(word, new WordFrequency(word, 1));
    } else {
        counts.put(word, new WordFrequency(word, previous.getFrequency() + 1));
    }
}

/** Appends {@code prefix + word} for every selected word to {@code featureNames} and returns the bare words. */
private static String[] collectPrefixed(Vector<WordFrequency> selected, String prefix, Vector<String> featureNames) {
    String[] words = new String[selected.size()];
    for (int i = 0; i < words.length; i++) {
        WordFrequency wf = selected.elementAt(i);
        featureNames.add(prefix + wf.getWord());
        words[i] = wf.getWord();
    }
    return words;
}
}