/*
############################################################################
##
## Copyright (C) 2006-2009 University of Utah. All rights reserved.
##
## This file is part of DeepPeep.
##
## This file may be used under the terms of the GNU General Public
## License version 2.0 as published by the Free Software Foundation
## and appearing in the file LICENSE.GPL included in the packaging of
## this file. Please review the following to ensure GNU General Public
## Licensing requirements will be met:
## http://www.opensource.org/licenses/gpl-license.php
##
## If you are unsure which license is appropriate for your use (for
## instance, you are interested in developing a commercial derivative
## of DeepPeep), please contact us at deeppeep@sci.utah.edu.
##
## This file is provided AS IS with NO WARRANTY OF ANY KIND, INCLUDING THE
## WARRANTY OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
##
############################################################################
*/

/*
 * @(#)StopListArquivo.java
 *
 * Copyright (c) 1997-1999 Departamento de Informática - UFPE
 *    Grupo:
 *      Luciano de A. Barbosa       (lab)
 *      Oscar G. de Miranda         (ogm)
 *      Thiago L.V.L. Santos        (tlvls)
 *      Flavio Couto                (frco)
 */

package focusedCrawler.util.string;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;

/**
 * Stop list whose word groups are loaded from one or more text files.
 *
 * <p>Each file is read line by line. Lines are trimmed; blank lines and lines
 * starting with {@code #} are skipped. A line starting with one of the
 * {@code "*** ..."} section markers selects the group that the following
 * lines are added to. Lines that appear before the first marker of a file are
 * ignored. The collected groups are handed to the superclass setters.
 */
@SuppressWarnings("serial")
public class StopListFile extends AbstractStopList {

    /** Stop list loaded from the {@code stopwords.en.txt} classpath resource during class initialization. */
    public static StopListFile DEFAULT;

    // Section markers recognized at the start of a (trimmed) line.
    private static final String MARCADOR_EXCECOES = "*** Excecoes";
    private static final String MARCADOR_IRRELEVANTES = "*** Palavras Irrelevantes";
    private static final String MARCADOR_COMPLEMENTARES = "*** Palavras Complementares";
    private static final String MARCADOR_PREFIXOS = "*** Prefixos";
    private static final String MARCADOR_SUFIXOS = "*** Terminacoes Ignoraveis";

    // Parser states: INICIO means "no section marker seen yet in the current file".
    // NOTE(review): these are intentionally left as instance fields. If the superclass is
    // Serializable (the "serial" suppression suggests so - confirm), turning them into static
    // fields would change the default serialVersionUID.
    final int INICIO = 1;
    final int EXCECOES = INICIO + 1;
    final int IRRELEVANTES = EXCECOES + 1;
    final int COMPLEMENTARES = IRRELEVANTES + 1;
    final int PREFIXOS = COMPLEMENTARES + 1;
    final int SUFIXOS = PREFIXOS + 1;

    static {
        String filename = "stopwords.en.txt";
        try (InputStream f = StopListFile.class.getClassLoader().getResourceAsStream(filename)) {
            if (f == null) {
                // getResourceAsStream signals a missing resource with null; fail with a
                // message that names the resource instead of a generic null-stream error.
                throw new IllegalStateException("Stopwords resource not found on classpath: " + filename);
            }
            DEFAULT = new StopListFile(f);
        } catch (IOException e) {
            // Keep the original exception as the cause so the real I/O failure is visible.
            throw new RuntimeException("Failed to load stopwords file: " + filename, e);
        }
    }

    /**
     * Creates a stop list from already opened streams.
     *
     * @param files streams to parse, in order; each one is closed once it has been read
     * @throws IOException if reading any stream fails
     * @throws IllegalArgumentException if any stream is {@code null}
     */
    public StopListFile(InputStream... files) throws IOException {
        parseFiles(files);
    }

    /**
     * Creates a stop list from files on disk.
     *
     * @param filenames paths of the files to parse, in order
     * @throws FileNotFoundException if one of the files cannot be opened
     * @throws IOException if reading any file fails
     */
    public StopListFile(String... filenames) throws IOException {
        InputStream[] files = new InputStream[filenames.length];
        try {
            for (int i = 0; i < filenames.length; i++) {
                files[i] = new FileInputStream(filenames[i]);
            }
            parseFiles(files);
        } finally {
            // Streams that were opened but never reached by the parser (because opening a
            // later file or reading an earlier one failed) would otherwise leak.
            for (InputStream file : files) {
                closeQuietly(file);
            }
        }
    }

    /**
     * Parses the given streams and passes every non-empty word group to the
     * corresponding superclass setter. Groups for which no entry was found are
     * left untouched.
     *
     * @param files streams to parse, in order; each one is closed once it has been read
     * @throws IOException if reading any stream fails
     * @throws IllegalArgumentException if any stream is {@code null}
     */
    public void parseFiles(InputStream... files) throws IOException {
        List<String> excecoes = new ArrayList<>();
        List<String> irrelevantes = new ArrayList<>();
        List<String> complementares = new ArrayList<>();
        List<String> prefixos = new ArrayList<>();
        List<String> sufixos = new ArrayList<>();

        for (InputStream file : files) {
            // Every file starts outside of any section.
            int estado = INICIO;
            for (String raw : readLines(file)) {
                String line = raw.trim();
                if (line.isEmpty() || line.startsWith("#")) {
                    continue;
                }
                int marker = markerState(line);
                if (marker != 0) {
                    estado = marker;
                    continue;
                }
                switch (estado) {
                    case EXCECOES:
                        excecoes.add(line);
                        break;
                    case IRRELEVANTES:
                        irrelevantes.add(line);
                        break;
                    case COMPLEMENTARES:
                        complementares.add(line);
                        break;
                    case PREFIXOS:
                        prefixos.add(line);
                        break;
                    case SUFIXOS:
                        sufixos.add(line);
                        break;
                    default:
                        // INICIO: content before the first section marker is ignored.
                        break;
                }
            }
        }

        // Only non-empty groups are set; per the original notes, each call replaces the
        // superclass array, which was null until then.
        if (!excecoes.isEmpty()) {
            super.setExcecoes(excecoes.toArray(new String[0]));
        }
        if (!irrelevantes.isEmpty()) {
            super.setIrrelevantes(irrelevantes.toArray(new String[0]));
        }
        if (!complementares.isEmpty()) {
            super.setComplementares(complementares.toArray(new String[0]));
        }
        if (!prefixos.isEmpty()) {
            super.setPrefixos(prefixos.toArray(new String[0]));
        }
        if (!sufixos.isEmpty()) {
            super.setSufixos(sufixos.toArray(new String[0]));
        }
    }

    /**
     * Maps a trimmed line to the parser state selected by its section marker.
     *
     * @return the state for the marker the line starts with, or {@code 0} if the line is not a marker
     */
    private int markerState(String line) {
        if (line.startsWith(MARCADOR_EXCECOES)) {
            return EXCECOES;
        }
        if (line.startsWith(MARCADOR_IRRELEVANTES)) {
            return IRRELEVANTES;
        }
        if (line.startsWith(MARCADOR_COMPLEMENTARES)) {
            return COMPLEMENTARES;
        }
        if (line.startsWith(MARCADOR_PREFIXOS)) {
            return PREFIXOS;
        }
        if (line.startsWith(MARCADOR_SUFIXOS)) {
            return SUFIXOS;
        }
        return 0;
    }

    /**
     * Reads all lines of the stream and closes it.
     *
     * @throws IOException if reading fails
     * @throws IllegalArgumentException if {@code fileStream} is {@code null}
     */
    private List<String> readLines(InputStream fileStream) throws IOException {
        if (fileStream == null) {
            throw new IllegalArgumentException("Input stream can't be null");
        }
        List<String> lines = new ArrayList<>();
        // NOTE(review): no charset is given, so the platform default is used for decoding.
        // Consider an explicit charset once the encoding of the stop-list files is confirmed.
        try (BufferedReader in = new BufferedReader(new InputStreamReader(fileStream))) {
            for (String temp = in.readLine(); temp != null; temp = in.readLine()) {
                lines.add(temp);
            }
        }
        return lines;
    }

    /** Closes the stream if it is non-null, ignoring any failure. */
    private static void closeQuietly(InputStream stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException ignored) {
            // Cleanup only: a close failure must not mask the exception (if any) that is
            // already propagating from the constructor.
        }
    }
}