package focusedCrawler.query;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.io.StringReader;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Vector;
import org.apache.commons.codec.binary.Base64;
import org.apache.xerces.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import focusedCrawler.target.classifier.TargetClassifier;
import focusedCrawler.target.classifier.TargetClassifierException;
import focusedCrawler.target.classifier.WekaTargetClassifier;
import focusedCrawler.target.model.Page;
import focusedCrawler.util.ParameterFile;
import focusedCrawler.util.parser.PaginaURL;
import focusedCrawler.util.string.PorterStemmer;
import focusedCrawler.util.string.StopList;
import focusedCrawler.util.string.StopListFile;
import focusedCrawler.util.vsm.VSMElement;
import focusedCrawler.util.vsm.VSMVector;
import weka.classifiers.Classifier;
import weka.core.Instances;
@SuppressWarnings("unused")
public class RelevanceFeedback {
private String appID = "87E3D4B83FBB733A778CCB8CF950FF62BD6243FC";
private String urlQuery = "http://api.bing.net/xml.aspx?AppId=87E3D4B83FBB733A778CCB8CF950FF62BD6243FC&Version=2.2&Market=en-US&Sources=web+spell&web.count=50&xmltype=attributebased&Query=";
private int connectTimeout = 5000;
private int readTimeout = 5000;
private int iterations = 20;
private StopList stoplist = null;
private TargetClassifier classifier;
private int total = 0;
private VSMVector positive = new VSMVector();
private VSMVector negative = new VSMVector();
// private HashMap<String,Integer> urlCode = new HashMap<String, Integer>();
private Vector<HashSet<String>> usedQueries = new Vector<HashSet<String>>();
private HashSet<String> usedURLs = new HashSet<String>();
// private InvertedIndex invIndex = new InvertedIndex();
public RelevanceFeedback(){
}
public RelevanceFeedback(StopList stoplist, TargetClassifier classifier) throws MalformedURLException, IOException, SAXException{
this.stoplist = stoplist;
this.classifier = classifier;
}
public void execute(String initialQuery) throws MalformedURLException, IOException, SAXException, TargetClassifierException{
Page page = downloadResults(initialQuery);
// System.out.println("Page:" + page.getContent());
HashSet<String> temp = new HashSet<String>();
PorterStemmer stem = new PorterStemmer();
String[] queryWords = initialQuery.split("\\+");
for (int j = 0; j < queryWords.length; j++) {
temp.add(queryWords[j].trim());
usedTerms.add(stem.stem(queryWords[j].trim()));
}
String currentQuery = initialQuery;
usedQueries.add(temp);
previousQuery = initialQuery;
for (int i = 0; i < iterations; i++) {
int count = 0;
System.out.println("Iteration:" + i);
String[] urls = parseXMLPage(page);
int newPages = 0;
Vector<Page> tempPages = new Vector<Page>();
VSMVector positiveTemp = new VSMVector();
VSMVector negativeTemp = new VSMVector();
for (int j = 0; j < urls.length; j++) {
if(!usedURLs.contains(urls[j]) && !urls[j].endsWith("pdf") && !urls[j].endsWith("pps")){
newPages++;
// System.out.println("URL:" + urls[j]);
page = downloadPage(new URL(urls[j]));
if(page != null){
boolean relevant = classifier.classify(page).isRelevant();
if(relevant){
System.out.println(getClass().getName());
tempPages.add(page);
VSMVector positiveTemp1 = new VSMVector(page.getContentAsString(),stoplist,false);
positiveTemp1.normalize();
positiveTemp.addVector(positiveTemp1);
count++;
}else{
// System.out.println("NonRelevant:" + urls[j]);
VSMVector negativeTemp1 = new VSMVector(page.getContentAsString(),stoplist,false);
negativeTemp1.normalize();
negativeTemp.addVector(negativeTemp1);
}
}
usedURLs.add(urls[j]);
}
}
Page[] pages = new Page[tempPages.size()];
tempPages.toArray(pages);
total = total + count;
double currentPrecision = (double)count/(double)newPages;
currentQuery = nextQuery(currentPrecision,newPages,currentQuery,positiveTemp,negativeTemp);
System.out.println("PREICSION:" + currentPrecision);
System.out.println("NEW PAGES:" + newPages);
System.out.println("NEXT QUERY:" + currentQuery);
page = downloadResults(currentQuery);
}
System.out.println("TOTAL RELEVANT:" + total);
}
private int previousQuerySize = 2;
private String previousQuery = "";
private int pointer = 1;
private HashSet<String> usedTerms = new HashSet<String>();
private int offsetCounter = 1;
private String nextQuery(double currentPrecision, int currentUnique, String currentQuery, VSMVector pos, VSMVector neg){
String offset = "&$skip=50";
PorterStemmer stem = new PorterStemmer();
boolean changeQuery = false;
if(currentPrecision > 0.5){
positive.addVector(pos);
negative.addVector(neg);
}else{
changeQuery = true;
}
if(currentUnique <= 0.5){
changeQuery = true;
}
double proportionUnique = (double)currentUnique/(double)50;
if(proportionUnique <= 0.5){
changeQuery = true;
}
if(!changeQuery && (offsetCounter/50) +1 <= 6){
int index = currentQuery.indexOf(offset);
if(index != -1){
previousQuery = currentQuery.substring(0,currentQuery.indexOf(offset));
}else{
previousQuery = currentQuery;
}
offsetCounter = offsetCounter+50;
return previousQuery + offset+offsetCounter;
}
offsetCounter = 1;
int index = currentQuery.indexOf(offset);
String tempQuery = "";
if(index != -1){
tempQuery = currentQuery.substring(0,currentQuery.indexOf(offset));
}else{
tempQuery = currentQuery;
}
String[] currentQueryWords = tempQuery.split("\\+");
for (int i = 0; i < currentQueryWords.length; i++) {
VSMElement elem = positive.getElement(currentQueryWords[i]);
elem.setWeight(currentPrecision*elem.getWeight());
positive.addElement(elem);
}
VSMVector tempPos = new VSMVector();
tempPos.addVector(positive);
// tempPos.normalize();
// System.out.println(invIndex.toString());
VSMVector tempNeg = new VSMVector();
tempNeg.addVector(negative);
// tempNeg.normalize();
tempNeg.negativeVector();
tempPos.addVector(tempNeg);
String queryTemp = null;
VSMElement[] elems = tempPos.topElements(tempPos.getArrayElements().length);
for (int i = 0; i < elems.length && i < 10; i++) {
System.out.print(elems[i].getWord() + " ");
}
System.out.println("");
int querySize = previousQuerySize;
HashSet<String> qWords = new HashSet<String>();
if(currentPrecision <= 0.5){
querySize++;
}else{
if(currentUnique <= 0.5){
querySize--;
}
}
int usedTermsSize = usedTerms.size();
if(queryTemp == null){
queryTemp = elems[0].getWord();
String tempWord = stem.stem(elems[0].getWord());
if(!usedTerms.contains(tempWord)){
usedTerms.add(tempWord);
}
if(!qWords.contains(tempWord)){
qWords.add(tempWord);
}
for (int i = pointer,step=1; step < querySize-1; i++,step++) {
tempWord = stem.stem(elems[i].getWord());
if(!usedTerms.contains(tempWord)){
usedTerms.add(tempWord);
}
if(!qWords.contains(tempWord)){
queryTemp = queryTemp + "+" + elems[i].getWord();
qWords.add(tempWord);
}else{
step--;
}
}
}
previousQuerySize = querySize;
// pointer++;
/*Using clusters*/
// StringBuffer query = new StringBuffer();
// query.append("(");
// for (int i = 0; i < clusters.length; i++) {
// if(i > 0){
// query.append(")+OR+(");
// }
// VSMElement[] elems = clusters[i].getCentroid().topElements(clusters[i].getCentroid().getArrayElements().length);
// query.append(elems[0].getWord());
// for (int j = 1; j < querySize; j++) {
// query.append("+" + elems[j].getWord());
// }
// }
// query.append(")");
int lastElement = querySize-1;
String query = "";
boolean newQuery = false;
while(!newQuery){
boolean newQueryTemp = false;
String tempWord = stem.stem(elems[lastElement].getWord());
if(usedTermsSize < usedTerms.size() || !qWords.contains(tempWord) && !usedTerms.contains(tempWord)){
newQuery = true;
query = queryTemp + "+" + elems[lastElement].getWord();
if(!usedTerms.contains(tempWord)){
usedTerms.add(tempWord);
}
// for (int i = 0; i < usedQueries.size(); i++) {
// newQueryTemp = false;
// HashSet<String> set = usedQueries.elementAt(i);
// String[] queryWords = query.split("\\+");
// for (int j = 0; j < queryWords.length; j++) {
// if(!set.contains(stem.stem(queryWords[j]))){
// newQueryTemp = true;
// }
// }
// if(!newQueryTemp){
// newQuery = false;
// }
//
// }
}
lastElement++;
}
HashSet<String> temp = new HashSet<String>();
String[] queryWords = query.split("\\+");
for (int j = 0; j < queryWords.length; j++) {
temp.add(stem.stem(queryWords[j].trim()));
}
usedQueries.add(temp);
// negative = new VSMVector();
// positive = new VSMVector();
// Vector<WordFrequencyMap> temp = new Vector<WordFrequencyMap>(relevantSample.values());
// Collections.sort(temp,new WordFrequencyComparator());
// String query = null;
// for(int i = 0;i<temp.size() && query == null;i++){
// if(!usedQueries.contains(temp.elementAt(i).getWord())){
// query = temp.elementAt(i).getWord();
// usedQueries.add(query);
// }
// }
previousQuery = currentQuery;
return query.toString();
}
private void addToSample(Page page, HashMap<String, WordFrequencyMap> sample){
PaginaURL pageParser = null;
pageParser = new PaginaURL(page.getURL(),page.getContentAsString(), stoplist);
// System.out.println("URL>>>"+page.getURL());
String[] words = pageParser.palavras();
int[] occurrencies = pageParser.ocorrencias();
for (int i = 0; i < words.length; i++) {
int frequency = 1;
Object value = sample.get(words[i]);
if(value != null){
frequency = ((WordFrequencyMap)value).getFrequency() + 1;
}
if(!words[i].equals("-") && !words[i].equals("&") && words[i].length() > 2){
sample.put(words[i], new WordFrequencyMap(words[i],frequency));
}
}
}
private Page downloadResults(String keyword){
String top = "5";
keyword = keyword.replaceAll(" ", "%20");
String accountKey="d9zIG4ICwyPiUzBz0pDB9fvGr/UKDqk82fYBlJlXmhc";
byte[] accountKeyBytes = Base64.encodeBase64((accountKey + ":" + accountKey).getBytes());
String accountKeyEnc = new String(accountKeyBytes);
URL url = null;
StringBuilder output = new StringBuilder();
try {
url = new URL("https://api.datamarket.azure.com/Data.ashx/Bing/SearchWeb/v1/Web?Query=%27" + keyword + "%27&$top="+ top);
System.out.println(url);
HttpURLConnection conn = (HttpURLConnection)url.openConnection();
conn.setRequestMethod("GET");
conn.setRequestProperty("Authorization", "Basic " + accountKeyEnc);
BufferedReader br = new BufferedReader(new InputStreamReader((conn.getInputStream())));
String line;
while ((line = br.readLine()) != null) {
output = output.append(line);
}
// System.out.println(output);
conn.disconnect();
} catch (MalformedURLException e1) {
e1.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return new Page(url, output.toString());
}
private Page downloadPage(URL urlCon) throws IOException {
System.out.println("Downloading URL:" + urlCon.toString());
// try {
// Thread.sleep(100);
// } catch (InterruptedException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
URLConnection yc = urlCon.openConnection();
// System.out.println("SETTING TIMEUOT...");
yc.setConnectTimeout(connectTimeout);
yc.setReadTimeout(readTimeout);
if(yc.getContentType()!=null && !yc.getContentType().contains("text") || urlCon.toString().endsWith("rtf")){
return null;
}
StringBuilder buffer = new StringBuilder();
try{
BufferedReader inCon = new BufferedReader(new InputStreamReader(yc.
getInputStream()));
String inputLine;
while ((inputLine = inCon.readLine()) != null) {
buffer.append(inputLine);
}
inCon.close();
}catch(java.lang.IllegalArgumentException ex){
// System.out.println("ILLEGAL ARGUMENT!!!\n");
return null;
}
catch(java.net.SocketTimeoutException ex){
// System.out.println("TIMEOUT EXCEPTION!!!\n");
return null;
}
catch(java.io.FileNotFoundException ex) {
// System.out.println("REMOTE FILE NOT FOUND!!!\n");
return null;
}
catch(java.net.UnknownHostException ex) {
// System.out.println("UNKNOWN HOST!!!\n");
return null;
}catch(Exception ex){
// System.out.println("Generic Exception\n");
return null;
}
Page pageRes = new Page(urlCon, buffer.toString());
// System.out.println("FINISHED TO DOWNLOAD THE PAGE : " + urlCon.toString() + "\n");
// System.out.println(pageRes.getContent());
return pageRes;
}
private String[] parseXMLPage(Page page) throws SAXException, IOException{
DOMParser parser = new DOMParser();
Vector<String> urls = new Vector<String>();
parser.parse(new InputSource(new ByteArrayInputStream(page.getContent())));
Document doc = parser.getDocument();
NodeList list = doc.getElementsByTagName("d:Url");
for (int j = 0; j < list.getLength(); j++) {
Node node = list.item(j);
NodeList children = node.getChildNodes();
Node child = children.item(0);
System.out.println(child.getTextContent());
urls.add(child.getTextContent());
// NamedNodeMap attrs = node.getAttributes();
// for (int i = 0; i < attrs.getLength(); i++) {
// Node attr = attrs.item(i);
// String attrName = ((attr.getNodeName().trim()).toLowerCase());
// String attrValue = ((attr.getNodeValue().trim()).toLowerCase());
// if(attrName.equals("url")){
// urls.add(attrValue);
//// System.out.println(attrName+":"+attrValue);
// }
// }
}
String[] res = new String[urls.size()];
urls.toArray(res);
return res;
}
public static void main(String[] args) {
// RelevanceFeedback rf = new RelevanceFeedback();
// try {
// rf.execute("Human Traffic");
// } catch (IOException | SAXException
// | TargetClassifierException e) {
// // TODO Auto-generated catch block
// e.printStackTrace();
// }
try {
ParameterFile config = new ParameterFile(args[0]);
StopList stoplist = new StopListFile(config.getParam("STOPLIST_FILES"));
InputStream is = new FileInputStream(config.getParam("FILE_CLASSIFIER"));
ObjectInputStream objectInputStream = new ObjectInputStream(is);
Classifier classifier = (Classifier) objectInputStream.readObject();
objectInputStream.close();
String[] attributes = config.getParam("ATTRIBUTES", " ");
weka.core.FastVector vectorAtt = new weka.core.FastVector();
for (int i = 0; i < attributes.length; i++) {
vectorAtt.addElement(new weka.core.Attribute(attributes[i]));
}
String[] classValues = config.getParam("CLASS_VALUES", " ");
weka.core.FastVector classAtt = new weka.core.FastVector();
for (int i = 0; i < classValues.length; i++) {
classAtt.addElement(classValues[i]);
}
vectorAtt.addElement(new weka.core.Attribute("class", classAtt));
Instances insts = new Instances("target_classification", vectorAtt, 1);
insts.setClassIndex(attributes.length);
TargetClassifier targetClassifier = new WekaTargetClassifier(classifier, config.getParamDouble("RELEVANCE_THRESHOLD"), insts, attributes, stoplist);
RelevanceFeedback rf = new RelevanceFeedback(stoplist, targetClassifier);
rf.execute(args[1]);
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (SAXException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (TargetClassifierException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (ClassNotFoundException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}