/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.link.classifier.builder;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import focusedCrawler.target.model.Page;
import focusedCrawler.target.model.ParsedData;
import focusedCrawler.util.parser.LinkNeighborhood;
import focusedCrawler.util.parser.PaginaURL;
import focusedCrawler.util.string.PorterStemmer;
import focusedCrawler.util.string.StopList;
/**
* <p>Description: Given a predefined set of words, this class extracts,
* for each link in a page, the frequency of those words. These words are
* the features used by the Link Classifier.</p>
*
* <p>Copyright: Copyright (c) 2004</p>
*
*
* @author Luciano Barbosa
* @version 1.0
*/
public class LinkNeighborhoodWrapper {

    /** Stop list handed to every {@link PaginaURL} parser created by this class; may be null. */
    private StopList stoplist = null;

    /**
     * Known feature words, indexed by the {@link WordField} field constants.
     * Stays null until one of the {@code setFeatures} overloads is called.
     */
    private String[][] fieldWords;

    private final PorterStemmer stemmer = new PorterStemmer();

    /**
     * Creates a wrapper without feature words; call {@code setFeatures}
     * before URL/SRC words are expected to match anything.
     *
     * @param stoplist stop list passed to the URL parsers
     */
    public LinkNeighborhoodWrapper(StopList stoplist) {
        this.stoplist = stoplist;
    }

    /**
     * Creates a wrapper and registers the given feature names.
     *
     * @param features feature names of the form {@code <field>_<word>}
     * @param stoplist stop list passed to the URL parsers
     */
    public LinkNeighborhoodWrapper(String[] features, StopList stoplist) {
        this.stoplist = stoplist;
        this.setFeatures(features);
    }

    /** Creates a wrapper with neither a stop list nor feature words. */
    public LinkNeighborhoodWrapper() {
    }

    /**
     * Sets the feature words directly.
     *
     * @param fieldWords words per field, indexed by the {@link WordField} field constants
     */
    public void setFeatures(String[][] fieldWords) {
        this.fieldWords = fieldWords;
    }

    /**
     * Splits feature names into per-field word lists. A feature is assigned to
     * a field by its prefix ({@code around_}, {@code alt_}, {@code src_},
     * {@code url_} or {@code anchor_}); the stored word is the segment between
     * the first and the second underscore. Features with any other prefix, or
     * with nothing after the prefix, are ignored.
     *
     * @param features feature names; a null array or null entries are skipped
     */
    public void setFeatures(String[] features) {
        List<String> aroundTemp = new ArrayList<String>();
        List<String> altTemp = new ArrayList<String>();
        List<String> srcTemp = new ArrayList<String>();
        List<String> urlTemp = new ArrayList<String>();
        List<String> anchorTemp = new ArrayList<String>();
        if (features != null) {
            for (String feature : features) {
                if (feature == null) {
                    continue;
                }
                collectFeature(feature, "around_", aroundTemp);
                collectFeature(feature, "alt_", altTemp);
                collectFeature(feature, "src_", srcTemp);
                collectFeature(feature, "url_", urlTemp);
                collectFeature(feature, "anchor_", anchorTemp);
            }
        }
        String[][] words = new String[WordField.FIELD_NAMES.length][];
        words[WordField.AROUND] = aroundTemp.toArray(new String[aroundTemp.size()]);
        words[WordField.ALT] = altTemp.toArray(new String[altTemp.size()]);
        words[WordField.SRC] = srcTemp.toArray(new String[srcTemp.size()]);
        words[WordField.URLFIELD] = urlTemp.toArray(new String[urlTemp.size()]);
        words[WordField.ANCHOR] = anchorTemp.toArray(new String[anchorTemp.size()]);
        this.fieldWords = words;
    }

    /** Adds the word of {@code feature} to {@code target} when the feature starts with {@code prefix}. */
    private static void collectFeature(String feature, String prefix, List<String> target) {
        if (!feature.startsWith(prefix)) {
            return;
        }
        String[] parts = feature.split("_");
        // split() drops trailing empty strings, so a bare prefix such as
        // "around_" yields a single element and has no word to register.
        if (parts.length > 1) {
            target.add(parts[1]);
        }
    }

    /**
     * Extracts, for every link of a page, the words in its URL, anchor and
     * surrounding text, and maps them onto the given features.
     *
     * @param page page containing the links to be processed
     * @param features pre-defined feature names used to build each instance
     * @return map from link URL to its instance
     * @throws MalformedURLException if a link URL cannot be parsed
     */
    public HashMap<String, Instance> extractLinks(Page page, String[] features) throws MalformedURLException {
        HashMap<String, WordField[]> linkFields = extractLinks(page);
        return mapFeatures(linkFields, features);
    }

    /**
     * Same as {@link #extractLinks(Page, String[])} for an explicit array of
     * link neighborhoods.
     *
     * @param linkNeighboors link neighborhoods to process
     * @param features pre-defined feature names used to build each instance
     * @return map from link URL to its instance
     * @throws MalformedURLException if a link URL cannot be parsed
     */
    public HashMap<String, Instance> extractLinks(LinkNeighborhood[] linkNeighboors, String[] features) throws MalformedURLException {
        HashMap<String, WordField[]> linkFields = extractLinks(linkNeighboors);
        return mapFeatures(linkFields, features);
    }

    /**
     * Builds the instance of a single link from its URL, anchor and
     * surrounding words (anchor and surrounding words are stemmed).
     *
     * @param linkNeighboor link neighborhood to process
     * @param features pre-defined feature names used to build the instance
     * @return map holding at most one entry, keyed by the link URL
     * @throws MalformedURLException if the link URL cannot be parsed
     */
    public HashMap<String, Instance> extractLinks(LinkNeighborhood linkNeighboor, String[] features) throws MalformedURLException {
        HashMap<String, WordField[]> linkFields = extractLinks(linkNeighboor);
        return mapFeatures(linkFields, features);
    }

    /**
     * Like {@link #extractLinks(LinkNeighborhood, String[])} but also uses the
     * image source and image alt words of the link.
     *
     * @param linkNeighboor link neighborhood to process
     * @param features pre-defined feature names used to build the instance
     * @return map holding at most one entry, keyed by the link URL
     * @throws MalformedURLException if the link URL cannot be parsed
     */
    public HashMap<String, Instance> extractLinksFull(LinkNeighborhood linkNeighboor, String[] features) throws MalformedURLException {
        HashMap<String, WordField[]> linkFields = extractLinksFull(linkNeighboor);
        return mapFeatures(linkFields, features);
    }

    /**
     * Turns the words collected for each URL into an {@link Instance}: every
     * word sets the feature {@code <field>_<word>} to 1, where {@code <field>}
     * is the lower-cased field name. URL and SRC words are only used when they
     * match one of the registered feature words.
     */
    private HashMap<String, Instance> mapFeatures(HashMap<String, WordField[]> linkFields, String[] features) {
        HashMap<String, Instance> result = new HashMap<String, Instance>();
        for (String url : linkFields.keySet()) {
            Instance instance = new Instance(features);
            for (WordField wordField : linkFields.get(url)) {
                int fieldId = wordField.getField();
                String field = WordField.FIELD_NAMES[fieldId].toLowerCase();
                String word = wordField.getWord();
                if (fieldId == WordField.URLFIELD || fieldId == WordField.SRC) {
                    for (String matched : searchSubstring(word, fieldId)) {
                        instance.setValue(field + "_" + matched, Double.valueOf(1.0));
                    }
                } else if (word != null) {
                    instance.setValue(field + "_" + word, Double.valueOf(1.0));
                }
            }
            result.put(url, instance);
        }
        return result;
    }

    /**
     * Stems a word, falling back to the word itself when the stemmer fails or
     * reports "No term" / "Invalid term".
     */
    private String stemming(String word) {
        String stemmed;
        try {
            stemmed = stemmer.stem(word);
            if (stemmed.indexOf("No term") != -1 || stemmed.indexOf("Invalid term") != -1) {
                stemmed = word;
            }
        } catch (Exception e) {
            // Deliberate best effort: any stemmer failure keeps the original word.
            stemmed = word;
        }
        return stemmed;
    }

    /**
     * Returns the registered words of {@code field} that equal the lower-cased
     * {@code word}. Despite the method name this is an exact comparison, made
     * after dropping everything up to the first underscore of each registered
     * word. Returns an empty list when the word is null or when no words are
     * registered for the field.
     */
    private List<String> searchSubstring(String word, int field) {
        List<String> result = new ArrayList<String>();
        if (word == null || fieldWords == null
                || field < 0 || field >= fieldWords.length
                || fieldWords[field] == null) {
            return result;
        }
        String lowered = word.toLowerCase();
        for (String candidate : fieldWords[field]) {
            if (candidate == null) {
                continue;
            }
            int index = candidate.indexOf("_");
            if (index != -1) {
                candidate = candidate.substring(index + 1);
            }
            if (lowered.equals(candidate)) {
                result.add(candidate);
            }
        }
        return result;
    }

    /**
     * Collects the URL, image source, anchor, surrounding and image alt words
     * of one link, in that order; all but the host word are stemmed.
     */
    private HashMap<String, WordField[]> extractLinksFull(LinkNeighborhood ln) throws MalformedURLException {
        HashMap<String, WordField[]> result = new HashMap<String, WordField[]>();
        List<WordField> words = new ArrayList<WordField>();
        String urlStr = ln.getLink().toString();
        getURLWords(urlStr, words);
        if (ln.getImgSrc() != null) {
            // The image source is tokenized by the same parser as URL paths,
            // using a placeholder base URL.
            PaginaURL pageParser = new PaginaURL(new URL("http://"), ln.getImgSrc(), stoplist);
            addWords(words, WordField.SRC, pageParser.palavras(), true);
        }
        addWords(words, WordField.ANCHOR, ln.getAnchor(), true);
        addWords(words, WordField.AROUND, ln.getAround(), true);
        addWords(words, WordField.ALT, ln.getImgAlt(), true);
        putIfNotEmpty(result, urlStr, words);
        return result;
    }

    /** Collects the URL, anchor and surrounding words of one link; all but the host word are stemmed. */
    private HashMap<String, WordField[]> extractLinks(LinkNeighborhood ln) throws MalformedURLException {
        HashMap<String, WordField[]> result = new HashMap<String, WordField[]>();
        List<WordField> words = new ArrayList<WordField>();
        String urlStr = ln.getLink().toString();
        getURLWords(urlStr, words);
        addWords(words, WordField.ANCHOR, ln.getAnchor(), true);
        addWords(words, WordField.AROUND, ln.getAround(), true);
        putIfNotEmpty(result, urlStr, words);
        return result;
    }

    /** Collects the words of every link neighborhood found in the page's parsed data. */
    private HashMap<String, WordField[]> extractLinks(Page page) throws MalformedURLException {
        ParsedData parsedData = page.getParsedData();
        LinkNeighborhood[] linkNeighboors = parsedData.getLinkNeighborhood();
        return extractLinks(linkNeighboors);
    }

    /**
     * Collects the URL, anchor and surrounding words of each link. When two
     * links share a URL the later one replaces the earlier one.
     */
    private HashMap<String, WordField[]> extractLinks(LinkNeighborhood[] linkNeighboors) throws MalformedURLException {
        HashMap<String, WordField[]> result = new HashMap<String, WordField[]>();
        if (linkNeighboors == null) {
            return result;
        }
        for (LinkNeighborhood ln : linkNeighboors) {
            List<WordField> words = new ArrayList<WordField>();
            String urlStr = ln.getLink().toString();
            getURLWords(urlStr, words);
            // NOTE(review): unlike the single-link overloads, anchor and
            // surrounding words are NOT stemmed here - confirm this is intended.
            addWords(words, WordField.ANCHOR, ln.getAnchor(), false);
            addWords(words, WordField.AROUND, ln.getAround(), false);
            putIfNotEmpty(result, urlStr, words);
        }
        return result;
    }

    /** Appends one WordField per term, optionally stemmed; a null term array adds nothing. */
    private void addWords(List<WordField> target, int field, String[] terms, boolean stem) {
        if (terms == null) {
            return;
        }
        for (String term : terms) {
            target.add(new WordField(field, stem ? stemming(term) : term));
        }
    }

    /** Stores the collected words under the URL unless no word was collected. */
    private static void putIfNotEmpty(HashMap<String, WordField[]> result, String urlStr, List<WordField> words) {
        if (!words.isEmpty()) {
            result.put(urlStr, words.toArray(new WordField[words.size()]));
        }
    }

    /**
     * Puts the words of a URL into the bag of words: a {@code host_<suffix>}
     * word built from the text after the last dot of the host (when the host
     * contains a dot), followed by the stemmed terms the parser extracts from
     * the URL's file part.
     *
     * @param urlStr URL to decompose
     * @param wordsFields list the URL words are appended to
     * @throws MalformedURLException if {@code urlStr} is not a valid URL
     */
    private void getURLWords(String urlStr, List<WordField> wordsFields) throws MalformedURLException {
        URL url = new URL(urlStr);
        String host = url.getHost();
        int index = host.lastIndexOf(".");
        if (index != -1) {
            // NOTE(review): setFeatures(String[]) keeps only the segment between
            // the first and second underscore of a feature, so a feature like
            // "url_host_com" is stored as "host" and does not appear to match
            // this "host_com" word - confirm.
            host = "host_" + host.substring(index + 1);
            wordsFields.add(new WordField(WordField.URLFIELD, host));
        }
        PaginaURL pageParser = new PaginaURL(url, url.getFile(), stoplist);
        addWords(wordsFields, WordField.URLFIELD, pageParser.palavras(), true);
    }
}