/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/
package focusedCrawler.link.classifier.builder;
import java.util.Collections;
import java.util.Vector;
import java.util.HashSet;
public class FilterData {
private int maxElements;
private int maxWordSize;
public FilterData(int maxElements, int maxWordSize) {
this.maxElements = maxElements;
this.maxWordSize = maxWordSize;
}
public Vector<WordFrequency> filter(Vector<WordFrequency> sortList, Vector<WordFrequency> aroundWords){
boolean stem = aroundWords != null;
Vector<WordFrequency> filteredWords = new Vector<>();
int minFreq = 5;
int count = 0;
int i = 0;
int maxFreq = ((WordFrequency)sortList.firstElement()).getFrequency();
if(maxFreq <= 10){
minFreq = 1;
}
while (i < sortList.size() && (count < maxElements || stem)) {
WordFrequency wordFrequency = (WordFrequency)sortList.elementAt(i);
String word = wordFrequency.getWord();
int frequency = wordFrequency.getFrequency();
if(word.length() > maxWordSize){
if(frequency > minFreq){
filteredWords.add(wordFrequency);
count++;
}
}
i++;
}
if(stem){
Vector<WordFrequency> stemmedWords = stemming(filteredWords, aroundWords, sortList);
Vector<WordFrequency> result = new Vector<>();
for (int j = 0; j < maxElements && j < stemmedWords.size(); j++) {
result.add(stemmedWords.elementAt(j));
}
return result;
}else{
return filteredWords;
}
}
public Vector<WordFrequency> filter(Vector<WordFrequency> sortList){
Vector<WordFrequency> filteredWords = new Vector<>();
int count = 0;
int i = 0;
while (i < sortList.size() && count < maxElements) {
WordFrequency wordFrequency = (WordFrequency)sortList.elementAt(i);
String word = wordFrequency.getWord();
if(word.length() > maxWordSize){
filteredWords.add(wordFrequency);
count++;
}
i++;
}
return stemming(filteredWords, null, sortList);
}
private Vector<WordFrequency> stemming(Vector<WordFrequency> wordFreqList, Vector<WordFrequency> aroundWords, Vector<WordFrequency> initialList){
Vector<WordFrequency> finalWords = new Vector<>();
HashSet<String> usedWords = new HashSet<>();
if(aroundWords != null){
for (int i = 0; i < aroundWords.size(); i++) {
boolean exist = false;
WordFrequency firstWordFreq = (WordFrequency)aroundWords.elementAt(i);
String word = firstWordFreq.getWord();
firstWordFreq = new WordFrequency(word,0);
for (int j = 0; j < wordFreqList.size(); j++) {
WordFrequency wordFreqTemp = (WordFrequency) wordFreqList.elementAt(j);
if(word.equals(wordFreqTemp.getWord())){
exist = true;
break;
}
if (!usedWords.contains(word) && wordFreqTemp.getWord() != null &&
wordFreqTemp.getWord().indexOf(word) != -1) {
usedWords.add(wordFreqTemp.getWord());
firstWordFreq.incrementFrequncy(wordFreqTemp.getFrequency());
}
}
if(!exist){
wordFreqList.add(firstWordFreq);
}
}
}
Collections.sort(wordFreqList, new WordSizeComparator());
for (int i = 0; i < wordFreqList.size(); i++) {
WordFrequency firstWordFreq = (WordFrequency)wordFreqList.elementAt(i);
String word = firstWordFreq.getWord();
if(word != null &&
(usedWords.contains(word) ||
(word.equals("net") || word.equals("http") || word.equals("www") || word.equals("cfm") || word.equals("cgi") ||
word.equals("asp") || word.equals("php") || word.equals("jsp")))){//test
continue;
}
if(word != null){
for (int j = 0; j < initialList.size(); j++) {
WordFrequency wordFreqTemp = (WordFrequency) initialList.elementAt(j);
if (wordFreqTemp.getWord() != null &&
wordFreqTemp.getWord().indexOf(word) != -1) {
usedWords.add(wordFreqTemp.getWord());
firstWordFreq.incrementFrequncy(wordFreqTemp.getFrequency());
}
}
if(firstWordFreq.getFrequency() > 0){
finalWords.add(firstWordFreq);
}
}
}
Collections.sort(finalWords, new WordFrequencyComparator());
return finalWords;
}
}