/* ############################################################################ ## ## Copyright (C) 2006-2009 University of Utah. All rights reserved. ## ## This file is part of DeepPeep. ## ## This file may be used under the terms of the GNU General Public ## License version 2.0 as published by the Free Software Foundation ## and appearing in the file LICENSE.GPL included in the packaging of ## this file. Please review the following to ensure GNU General Public ## Licensing requirements will be met: ## http://www.opensource.org/licenses/gpl-license.php ## ## If you are unsure which license is appropriate for your use (for ## instance, you are interested in developing a commercial derivative ## of DeepPeep), please contact us at deeppeep@sci.utah.edu. ## ## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE ## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. ## ############################################################################ */ /* * @(#)AbstractStopList.java * * Copyright (c) 1997-1999 Departamento de Inform�tica - UFPE * Grupo: * Luciano de A. Barbosa (lab) * Oscar G. de Miranda (ogm) * Thiago L.V.L. Santos (tlvls) * Flavio Couto (frco) */ package focusedCrawler.util.string; @SuppressWarnings("serial") public abstract class AbstractStopList implements StopList { protected String excecoes[] = null; protected String irrelevantes[] = null; protected String complementares[] = null; protected String prefixos[] = null; protected String sufixos[] = null; protected static final int MIN_LENGTH = 2; protected static final int MAX_LENGTH = 30; protected static final int MIN_NUM_LENGTH = 2; protected static final int MAX_NUM_LENGTH = 30; public AbstractStopList() { this(new String[0], new String[0], new String[0], new String[0], new String[0], true); } public AbstractStopList(String excecoes[], String irrelevantes[]) { this(excecoes, irrelevantes, new String[0], new String[0], new String[0], true); } public AbstractStopList(String excecoes[], String irrelevantes[], String complementares[]) { this(excecoes, irrelevantes, complementares, new String[0], new String[0], true); } public AbstractStopList( String excecoes[],String irrelevantes[],String complementares[],String prefixos[],String sufixos[] ) { this(excecoes, irrelevantes, complementares, prefixos, sufixos, true); } public AbstractStopList( String excecoes[],String irrelevantes[],String complementares[],String prefixos[],String sufixos[],boolean realizarQuickSort ) { this.excecoes = excecoes; this.irrelevantes = irrelevantes; this.complementares = complementares; this.prefixos = prefixos; this.sufixos = sufixos; if( realizarQuickSort ) realizarQuickSort(); } protected String[] getExcecoes() { return excecoes; } protected void setExcecoes( String array[] ) { excecoes = array; if( excecoes != null ) quickSortString( excecoes ); } protected String[] getIrrelevantes() { return irrelevantes; } protected void setIrrelevantes( String array[] ) { irrelevantes = array; if( irrelevantes != null ) quickSortString( irrelevantes ); } protected void setComplementares( String array[] ) { complementares = array; if( complementares != null ) quickSortString( complementares ); } protected void setPrefixos( String array[] ) { prefixos = array; if( prefixos != null ) quickSortString( prefixos ); } protected void setSufixos( String array[] ) { sufixos = array; if( sufixos != null ) quickSortString( sufixos ); } protected void realizarQuickSort() { if( excecoes != null ) quickSortString( excecoes ); if( irrelevantes != null ) quickSortString( irrelevantes ); if( complementares != null ) quickSortString( complementares ); if( prefixos != null ) quickSortString( prefixos ); if( sufixos != null ) quickSortString( sufixos ); } protected void quickSortString( String str[] ) { quicksort_str( str,0,str.length-1 ); } protected void quicksort_str( String str[],int left,int right ) { String pivot; int l = left; int r = right; if(left<right) { pivot = str[(left+right)/2]; while(l<=r) { while( str[l].compareTo( pivot ) < 0 & l < right ) l++; while( str[r].compareTo( pivot ) > 0 & r > left ) r--; if(l<=r) { troque( str,l,r ); l++; r--; } } if(left<r) quicksort_str( str,left,r ); if(l<right) quicksort_str( str,l,right ); } } // Funcao auxiliar do quicksort protected void troque( String str[],int l,int r ) { String temp; temp = str[l]; str[l] = str[r]; str[r] = temp; } public boolean isStopWord(String word){ boolean isStop = false; word = word.trim(); int size = word.length(); if( size < MIN_LENGTH || size > MAX_LENGTH ){ return true; } if(pertenceAoArray(word,irrelevantes)){ return true; } char first = word.charAt(0); if(eNumero(first) || first < 48 || (first > 57 && first < 65) || (first > 90 && first < 97) || (first > 123 && first < 128)){ return true; } return isStop; } public boolean isIrrelevant( String palavra) { if( palavra == null ) return true; palavra = palavra.trim(); int size = palavra.length(); if( pertenceAoArray( palavra, excecoes )) return false; if( size < MIN_LENGTH || size > MAX_LENGTH ) return true; // char first = palavra.charAt(0); // if( !(eNumero(first) || eLetra(first)) ) // // return true; // // char last = palavra.charAt(size-1); // // if( !(eNumero(last) || eLetra(last)) ) // // return true; if( apenasNumero(palavra) && (size < MIN_NUM_LENGTH || size > MAX_NUM_LENGTH) ) return true; if( apenasHifen(palavra) ) return true; if( apenasNumeroEHifen(palavra) ) return true; if( possuiCaracteresIrrelevantes( palavra ) ) return true; if( pertenceAoArray( palavra,irrelevantes ) ) return true; if( pertenceAoArray( palavra,complementares ) ) return true; if( possuiPrefixos( palavra,prefixos ) ) return true; if( possuiSufixos( palavra,sufixos ) ) return true; return false; } /** * Indica se uma string e formada apenas por numeros. */ protected boolean apenasNumero( String palavra ) { boolean numero = true; int i = 0; char c; int size = palavra.length(); while( i < size && numero ) { c = palavra.charAt(i); numero = numero && eNumero(c); i++; } return numero; } /** * Indica se uma string e formada apenas por hifens. */ protected boolean apenasHifen( String palavra ) { boolean hifen = true; int i = 0; char c; int size = palavra.length(); while( i < size && hifen ) { c = palavra.charAt(i); hifen = hifen && (c == '-'); i++; } return hifen; } /** * Tenta identificar um conjunto de caracteres, no caso os numeros. * >48 e <57 para 0..9 */ protected boolean eNumero( char c ) { int value = (int) c; return ( 48 <= value && value <= 57 ); } /** * Tenta identificar um conjunto de caracteres, no caso as letras do alfabeto. * >65 e <90 para a..z * >97 e <122 para A..Z */ protected boolean eLetra( char c ) { int value = (int)(Acentos.retirarAcentosANSI(""+c).charAt(0)); return ( 65 <= value && value <= 90 ) || ( 97 <= value && value <= 122 ); } /** * Indica se a palavra e formada apenas por numeros e hifens, sendo assim capaz * de identificar os numero de telefone, CPF, CEP, etc. */ protected boolean apenasNumeroEHifen( String palavra ) { boolean numero = false,hifen = false,outro = false; int i = 0; char c; int size = palavra.length(); while( i < size ) { c = palavra.charAt(i); if( eNumero(c) ) numero = !outro; else if( c == '-' ) hifen = !outro; else { numero = false; hifen = false; outro = true; } i++; } return numero || hifen; } /** * Tenta identifica palavras com caracteres "malucos" deixando apenas as letras, numeros e hifens. * !=45 para ignorar os hifens * <48 para caracteres n�o visualiz�veis e !"#$%&'()*+,./ * >57 e <65 para :;<=>?@ * >90 e <97 para [\]^_` * >122 e <192 para {|}~�������������������������������� al�m de alguns caracteres n�o visualiz�veis */ private boolean possuiCaracteresIrrelevantes(String palavra) { int codigoANSI; int size = palavra.length(); // este size evita que o metodo length() seja chamado a cada loop. for (int i = 0; i < size; i++) { codigoANSI = (int) palavra.charAt(i); if ( codigoANSI != 45 && ( codigoANSI < 48 || ( codigoANSI > 57 && codigoANSI < 65 ) || ( codigoANSI > 90 && codigoANSI < 97 ) || ( codigoANSI > 122 && codigoANSI < 192 ) ) ) return true; } return false; } /** Busca binária em um array de Strings */ protected boolean pertenceAoArray(String palavra, String array[]) { boolean achou = false; if (array != null) { int pos; int left = 0, right = array.length - 1; while (!achou && left <= right) { pos = ((right + left) / 2); if (array[pos].compareTo(palavra) == 0) achou = true; else if (array[pos].compareTo(palavra) < 0) left = pos + 1; else right = pos - 1; } } return achou; } /** Verifica os prefixos. */ protected boolean possuiPrefixos(String palavra, String term[]) { boolean achou = false; if (term != null) { for (int i = 0; i < term.length && !achou; i++) if (palavra.startsWith(term[i])) achou = true; } return achou; } /** Verifica os sufixos. */ protected boolean possuiSufixos( String palavra, String term[] ) { boolean achou = false; if (term != null) { for (int i = 0; i < term.length && !achou; i++) if (palavra.endsWith(term[i])) achou = true; } return achou; } }