/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.util.vsm;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Vector;
import org.cyberneko.html.parsers.DOMParser;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import focusedCrawler.util.parser.PaginaURL;
import focusedCrawler.util.string.PorterStemmer;
import focusedCrawler.util.string.StopList;
/**
* <p>Title: </p>
*
* <p>Description: </p>
*
* @author not attributable
* @version 1.0
*/
public class VSMVector {
private HashMap<String, VSMElement> elems;
private PorterStemmer stemmer = new PorterStemmer();
private StopList stoplist;
private String id;
public VSMVector() {
this.elems = new HashMap<>();
}
public VSMVector(StopList stoplist) {
this.elems = new HashMap<>();
this.stoplist = stoplist;
}
public VSMVector(String file, boolean isForm, StopList stoplist) throws MalformedURLException, IOException, SAXException {
this.stoplist = stoplist;
this.elems = new HashMap<>();
if(isForm){
DOMParser parser = new DOMParser();
if((file.toLowerCase()).indexOf("<form ") != -1){//verify if the string is the name of file or the content of form
parser.parse(new InputSource(new BufferedReader(new StringReader(file))));
}else{
parser.parse(file);
}
String srcForm = "";
Document doc = parser.getDocument();
NodeList list = doc.getElementsByTagName("form");
StringBuffer source = new StringBuffer();
parse(list.item(0), source, new StringBuffer(), "html", stoplist);
srcForm = source.toString().toLowerCase();
PaginaURL formPage = new PaginaURL(new URL("http://www"),srcForm, stoplist);
stemPage(formPage, true);
} else {
StringBuffer content = new StringBuffer();
BufferedReader input = new BufferedReader(new FileReader(new File(
file)));
for (String line = input.readLine(); line != null;
line = input.readLine()) {
content.append(line);
content.append("\n");
}
input.close();
String src = content.toString();
PaginaURL page = new PaginaURL(new URL("http://www"), src, stoplist);
addTitle(page, stoplist);
stemPage(page, false);
}
}
private void addTitle(PaginaURL page, StopList stoplist) throws MalformedURLException{
this.stoplist = stoplist;
PaginaURL title = new PaginaURL(new URL("http://www"),page.titulo(), stoplist);
String[] titleWords = title.palavras();
String[] metaTerms = page.palavrasMeta();
int[] metaOccurrencies = page.ocorrenciasMeta();
for (int i = 0; i < metaTerms.length; i++) {
String word = metaTerms[i].toLowerCase();
word = stemmer.stem(word);
if(word.indexOf("No term") != -1){
continue;
}
if(word.length() > 2 ){
word = "meta" + word;
VSMElement vsmElem = this.getElement(word);
if(vsmElem == null){
this.addElement(new VSMElement(word,metaOccurrencies[i]));
}else{
double weight = vsmElem.getWeight();
this.addElement(new VSMElement(word,weight+1));
}
}
}
for (int i = 0; i < titleWords.length; i++) {
String word = titleWords[i].toLowerCase();
word = stemmer.stem(word);
if(word.indexOf("No term") != -1){
continue;
}
if(word.length() > 2 ){
word = "title" + word;
VSMElement vsmElem = this.getElement(word);
if(vsmElem == null){
this.addElement(new VSMElement(word,1));
}else{
double weight = vsmElem.getWeight();
this.addElement(new VSMElement(word,weight+1));
}
}
}
}
public VSMVector(String document, StopList stoplist, boolean stem) throws MalformedURLException {
if(!document.contains("<html>")){
document = "<html> " + document + " </html>";
}
this.stoplist = stoplist;
this.elems = new HashMap<>();
PaginaURL page = new PaginaURL(new URL("http://www"),document, stoplist);
addTitle(page,stoplist);
if(stem){
stemPage(page,false);
}else{
String[] words = page.palavras();
int[] frequencies = page.ocorrencias();
for (int i = 0; i < words.length; i++) {
if(frequencies[i] == 0){
continue;
}
// if(words[i].length() > 2 ){
VSMElement vsmElem = this.getElement(words[i]);
if(vsmElem == null){
this.addElement(new VSMElement(words[i],1));
}else{
double weight = vsmElem.getWeight();
this.addElement(new VSMElement(words[i],1+weight));
}
// }
}
}
}
public String getId(){
return this.id;
}
public VSMVector(String id, String document, StopList stoplist) throws
MalformedURLException {
this(document,stoplist);
this.id = id;
}
public VSMVector(String document, StopList stoplist) throws MalformedURLException {
if(!document.contains("<html>")){
document = "<html> " + document + " </html>";
}
this.stoplist = stoplist;
this.elems = new HashMap<>();
PaginaURL page = new PaginaURL(new URL("http://www"), document, stoplist);
addTitle(page, stoplist);
stemPage(page, false);
}
public VSMVector(PaginaURL page, StopList stoplist) throws MalformedURLException {
this.stoplist = stoplist;
this.elems = new HashMap<>();
stemPage(page, false);
}
public VSMVector(String []words, StopList stoplist) throws MalformedURLException, IOException, SAXException {
this.stoplist = stoplist;
String word;
for (int i = 0; i < words.length; i++) {
word = stemmer.stem(words[i]);
VSMElement vsmElem = this.getElement(word);
if(vsmElem == null){
this.addElement(new VSMElement(word, 1));
}else{
double weight = vsmElem.getWeight();
this.addElement(new VSMElement(word, 1+weight));
}
}
}
public void addElements(String []words) {
for (int i = 0; i < words.length; i++) {
this.addElement(words[i]);
}
}
public void addElement(String word) {
this.addElement(new VSMElement(word, 1));
}
public void addElement(VSMElement elem){
// if(!stoplist.eIrrelevante(elem.getWord())){
// word = stemmer.stem(word);
VSMElement vsmElem = this.getElement(elem.getWord());
if(vsmElem == null){
elems.put(elem.getWord(), elem);
}else{
double weight = vsmElem.getWeight();
elems.put(elem.getWord(),new VSMElement(elem.getWord(), elem.getWeight()+weight));
}
// }
}
public VSMElement getElement(String word){
return elems.get(word);
}
public Iterator<VSMElement> getElements(){
return elems.values().iterator();
}
public VSMElement[] getArrayElements(){
VSMElement[] elementsTemp = new VSMElement[elems.size()];
Iterator<VSMElement> iterator = elems.values().iterator();
int count = 0;
while (iterator.hasNext()) {
elementsTemp[count] = iterator.next();
count++;
}
return elementsTemp;
}
public int size(){
return elems.size();
}
public void addDFs(HashMap<String, Integer> idfs){
Iterator<String> iter = elems.keySet().iterator();
while(iter.hasNext()){
VSMElement elem = elems.get(iter.next());
if(elem != null){
String term = elem.getWord();
double freq = elem.getWeight();
Integer df = idfs.get(term);
if(df == null){
elem.setWeight(0);
}else{
double weight = freq / df.doubleValue();
elem.setWeight(weight);
}
}
}
}
public double vectorSpaceSimilarityIDF(VSMVector vectorB, HashMap<String, Integer> idfs){
VSMVector vectorA = this;
double denominatorA = 0;
double denominatorB = 0;
VSMElement elem = null;
Iterator<VSMElement> iterA = vectorA.getElements();
while(iterA.hasNext()){
elem = iterA.next();
if((Integer)idfs.get(elem.getWord()) != null){
int idf = idfs.get(elem.getWord()).intValue();
double weight = elem.getWeight()*Math.log((double)idfs.size()/(double)idf);
denominatorA = denominatorA + (weight*weight);
}
}
Iterator<VSMElement> iterB = vectorB.getElements();
while(iterB.hasNext()){
elem = iterB.next();
if((Integer)idfs.get(elem.getWord()) != null){
int idf = ( (Integer) idfs.get(elem.getWord())).intValue();
double weight = elem.getWeight() *
Math.log( (double) idfs.size() / (double) idf);
denominatorB = denominatorB + (weight * weight);
}
}
double numerator = 0;
iterA = vectorA.getElements();
while(iterA.hasNext()){
VSMElement elemA = iterA.next();
VSMElement elemB = vectorB.getElement(elemA.getWord());
if( elemB != null){
if(idfs.get(elemA.getWord()) != null){
int idf = ( (Integer) idfs.get(elemA.getWord())).intValue();
double weightA = elemA.getWeight() * Math.log( (double) idfs.size() / (double) idf);
double weightB = elemB.getWeight() * Math.log( (double) idfs.size() / (double) idf);
numerator = numerator + weightA*weightB;
}
}
}
double weight = numerator/(Math.sqrt(denominatorA)*Math.sqrt(denominatorB));
return weight;
}
public double jaccardSimilarity(VSMVector vectorB){
VSMVector vectorA = this;
double numerator = 0;
Iterator<VSMElement> iterA = vectorA.getElements();
while(iterA.hasNext()){
VSMElement elemA = iterA.next();
VSMElement elemB = vectorB.getElement(elemA.getWord());
if( elemB != null){ //overlap
// numerator = numerator + elemA.getWeight()*elemB.getWeight();
numerator = numerator + 1;
}
}
double denominator = vectorA.size() + vectorB.size() - numerator;
// System.out.println("NUMERATOR:"+numerator);
// System.out.println("NUMERATOR:"+denominator);
return numerator/denominator;
}
public double intersection(VSMVector vectorB){
VSMVector vectorA = this;
double numerator = 0;
Iterator<VSMElement> iterA = vectorA.getElements();
while(iterA.hasNext()){
VSMElement elemA = iterA.next();
VSMElement elemB = vectorB.getElement(elemA.getWord());
if( elemB != null){ //overlap
// numerator = numerator + elemA.getWeight()*elemB.getWeight();
numerator = numerator + 1;
}
}
return numerator;
}
public VSMVector clone(){
VSMVector res = new VSMVector(stoplist);
Iterator<VSMElement> iter = this.getElements();
while(iter.hasNext()){
VSMElement tempElem = iter.next();
res.addElement(tempElem);
}
return res;
}
public double vectorSpaceSimilarity(VSMVector vectorB){
VSMVector vectorA = this;
double denominatorA = 0;
double denominatorB = 0;
VSMElement elem = null;
Iterator<VSMElement> iterA = vectorA.getElements();
while(iterA.hasNext()){
elem = iterA.next();
double weight = elem.getWeight();
denominatorA = denominatorA + (weight*weight);
}
if(denominatorA == 0){
return 0;
}
Iterator<VSMElement> iterB = vectorB.getElements();
while(iterB.hasNext()){
elem = iterB.next();
double weight = elem.getWeight();
denominatorB = denominatorB + (weight * weight);
}
if(denominatorB == 0){
return 0;
}
double numerator = 0;
iterA = vectorA.getElements();
while(iterA.hasNext()){
VSMElement elemA = iterA.next();
VSMElement elemB = vectorB.getElement(elemA.getWord());
if( elemB != null){
double weightA = elemA.getWeight();
double weightB = elemB.getWeight();
numerator = numerator + weightA*weightB;
}
}
double den = (Math.sqrt(denominatorA)*Math.sqrt(denominatorB));
double weight = numerator/den;
// if(weight > 0.52 && weight < 0.53){
// System.out.println("A:" + vectorA.toString());
// System.out.println("B:" + vectorB.toString());
// System.out.println("NUMERATOR:"+numerator);
// System.out.println("DENOMINA:"+denominatorA);
// System.out.println("DENOMINB:"+denominatorB);
// System.out.println("DENOMIN:"+den);
// System.out.println("SIM:"+weight);
// }
return weight;
}
public void addVector(VSMVector pageVector){
VSMVector centroidVector = this;
Iterator<VSMElement> iter = pageVector.getElements();
while(iter.hasNext()){
VSMElement tempElem = iter.next();
centroidVector.addElement(tempElem);
}
}
public void multiplyWeights(double factor){
Iterator<String> iter = elems.keySet().iterator();
while(iter.hasNext()){
String word = (String)iter.next();
VSMElement elem = elems.get(word);
elems.put(word,new VSMElement(word,elem.getWeight()*factor));
}
}
public void negativeVector(){
Iterator<String> iter = elems.keySet().iterator();
while(iter.hasNext()){
String word = (String)iter.next();
VSMElement elem = elems.get(word);
elems.put(word,new VSMElement(word,-elem.getWeight()));
}
}
public static HashMap<String, Integer> calculateIDFs(VSMVector[] vectors) throws IOException,
SAXException {
HashMap<String, Integer> idfs = new HashMap<>();
for (int i = 0; i < vectors.length; i++) {
VSMVector pageVector = vectors[i];
Iterator<VSMElement> iter = pageVector.getElements();
while(iter.hasNext()){
String word = iter.next().getWord();
Integer ocur = (Integer)idfs.get(word);
if( ocur == null){
idfs.put(word,new Integer(1));
}else{
idfs.put(word,new Integer(ocur.intValue()+1));
}
}
}
return idfs;
}
public HashMap<String, Integer> calculateWordOccurence(VSMVector[] vectors) throws IOException,
SAXException {
HashMap<String, Integer> idfs = new HashMap<>();
for (int i = 0; i < vectors.length; i++) {
VSMVector pageVector = vectors[i];
Iterator<VSMElement> iter = pageVector.getElements();
while(iter.hasNext()){
VSMElement elem = iter.next();
String word = elem.getWord();
Integer ocur = (Integer)idfs.get(word);
if( ocur == null){
idfs.put(word,new Integer((int)elem.getWeight()));
}else{
idfs.put(word,new Integer(ocur.intValue()+(int)elem.getWeight()));
}
}
}
return idfs;
}
private void stemPage(PaginaURL page, boolean isForm){
String[] words = page.palavras();
int[] frequencies = page.ocorrencias();
for (int i = 0; i < words.length; i++) {
String word = null;
try{
frequencies[i] = 1;
word = stemWord(words[i]);
if(word == null){
continue;
}
}catch(Exception e){
continue;
}
if(frequencies[i] == 0){
continue;
}
if(word.length() > 2 ){
VSMElement vsmElem = this.getElement(word);
if(vsmElem == null){
this.addElement(new VSMElement(word,frequencies[i]));
}else{
double weight = vsmElem.getWeight();
this.addElement(new VSMElement(word,frequencies[i]+weight));
}
}
}
}
private String stemWord(String word){
if(word.indexOf("font-") != -1 || word.indexOf("padding") != -1
|| word.indexOf("border") != -1 || word.indexOf("margin") != -1
|| word.indexOf("background") != -1 || word.indexOf("color") != -1
|| word.indexOf("width") != -1 || word.indexOf("field") != -1
|| word.indexOf("verdana") != -1 || word.indexOf("helvetica") != -1
|| word.indexOf("sans") != -1 || word.indexOf("arial") != -1){ //parser bug
return null;
}
word = stemmer.stem(word);
if(word.indexOf("No term") != -1){
return null;
}
return word;
}
public void normalizebyMax(){
VSMElement[] topElems = topElements(1);
double max = topElems[0].getWeight();
double total = 0;
for(VSMElement elem : elems.values()) {
total = total + elem.getWeight();
}
if(total != 0){
for(Map.Entry<String, VSMElement> entry : elems.entrySet()) {
VSMElement elem = entry.getValue();
elems.put(entry.getKey(),new VSMElement(entry.getKey(),elem.getWeight()/max));
}
}
}
public void normalize(){
double total = 0;
for(VSMElement elem : elems.values()) {
total = total + elem.getWeight();
}
if(total != 0){
for(Map.Entry<String, VSMElement> entry : elems.entrySet()) {
VSMElement elem = entry.getValue();
elems.put(entry.getKey(),new VSMElement(entry.getKey(),elem.getWeight()/total));
}
}
}
public void squaredNormalization(){
double total = 0;
for(VSMElement elem : elems.values()) {
total = total + Math.sqrt(elem.getWeight());
}
if(total != 0){
for(Map.Entry<String, VSMElement> entry : elems.entrySet()) {
VSMElement elem = entry.getValue();
elems.put(entry.getKey(),new VSMElement(entry.getKey(),Math.sqrt(elem.getWeight())/total));
}
}
}
public void normalizeOverElements() {
double total = elems.size();
for(Map.Entry<String, VSMElement> entry : elems.entrySet()) {
VSMElement elem = entry.getValue();
elems.put(entry.getKey(), new VSMElement(entry.getKey(), elem.getWeight() / total));
}
}
public VSMElement[] topElements(int n){
Vector<VSMElement> temp = new Vector<>();
for(VSMElement elem : elems.values()) {
temp.add(elem);
}
Collections.sort(temp,new VSMElementComparator());
VSMElement[] res = new VSMElement[n];
for (int i = 0; i < temp.size() && i < n; i++) {
res[i] = temp.elementAt(i);
}
return res;
}
public String toString(){
StringBuffer buf = new StringBuffer();
Vector<VSMElement> temp = new Vector<>();
Iterator<VSMElement> iter = elems.values().iterator();
while(iter.hasNext()){
VSMElement elem = iter.next();
temp.add(elem);
}
Collections.sort(temp,new VSMElementComparator());
buf.append("[");
for (int i = 0; i < temp.size(); i++) {
VSMElement elem = temp.elementAt(i);
buf.append(elem.toString());
buf.append(",");
}
buf.append("]");
return buf.toString();
}
public void remove(String word){
elems.remove(word);
}
private void parse(Node node, StringBuffer source, StringBuffer sourceTemp, String father,StopList stoplist) {
// System.out.println(node.getClass().getName());
// System.out.println("Name "+ node.getNodeName());
// System.out.println("Type "+ node.getNodeType());
// System.out.println("Value "+ node.getNodeValue());
if(node == null){
return;
}
String value = node.getNodeValue() + " of";
if(Node.TEXT_NODE == node.getNodeType()){
if(value.trim().indexOf("<") == -1){
PaginaURL pageTemp = null;
String[] words = null;
try {
pageTemp = new PaginaURL(new URL("http://www"),value, stoplist);
words = pageTemp.palavras();
}
catch (MalformedURLException ex) {
}
for(int i = 0; words != null && i < words.length; i++){
// String stem = stemmer.stem(words[i]);
// if(stem.equals("Invalid term")){
// stem = words[i];
// }
// if(stem.indexOf("check") != -1){
// stem = "check";
// }
//
// if(!father.equals("OPTION")){
// source.append("body");
// source.append(stem);
// source.append(" ");
// }else{
// source.append(stem);
// source.append(" ");
// }
String stem = words[i];
try{
stem = stemmer.stem(words[i]);
}catch(Exception e){
}
if(!father.equals("OPTION")){
if(stem.equals("Invalid term")){
stem = words[i];
}
if(stem.indexOf("check") != -1){
stem = "check";
}
source.append("body");
source.append(stem);
source.append(" ");
}else{
source.append(stem);
source.append(" ");
}
}
}
return;
}
if(node.getNodeName().equals("INPUT")){
NamedNodeMap attrs = node.getAttributes();
for (int i = 0; i < attrs.getLength(); i++) {
Node attr = attrs.item(i);
String attrName = ((attr.getNodeName().trim()).toLowerCase());
String attrValue = ((attr.getNodeValue().trim()).toLowerCase());
if(attrName.equals("type") && !attrValue.equals("hidden")){
source = source.append(sourceTemp);
sourceTemp.delete(0,sourceTemp.length());
}
}
}
father = node.getNodeName();
NodeList children = node.getChildNodes();
if (children != null) {
int len = children.getLength();
for (int i = 0; i < len; i++){
parse(children.item(i),source,sourceTemp, father, stoplist);
}
}
}
}