package com.transmem.nlp;
import java.io.FileReader;
import java.io.BufferedReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Hashtable;
import java.util.logging.Logger;
/**
 * Token filter that removes tokens which are too short, consist of
 * punctuation, start with a digit, or appear in a stop list.
 *
 * <p>The stop list is empty until {@link #loadStoplist(String)} is called.
 * The minimum token length defaults to 2 and the punctuation string defaults
 * to the empty string.</p>
 */
public class GenericFilter implements IFilter
{
    private static final Logger log_ = Logger.getLogger(GenericFilter.class.getName());

    /** Stop words, used as a set: every key maps to itself. */
    private final Hashtable<String,String> stoplist_ = new Hashtable<String,String>();

    /** Tokens with fewer characters than this are dropped by {@link #filter(String[])}. */
    private int minchars_ = 2;

    /** Punctuation marks; never {@code null} (see {@link #setPunctuationMarks(String)}). */
    private String punctmarks_ = "";

    /**
     * Reads a stop list file and adds every line of it, verbatim, to the
     * stop list. Lines are not trimmed, so surrounding whitespace in the file
     * becomes part of the stop word. Entries from earlier calls are kept.
     *
     * <p>NOTE(review): the file is decoded by {@link FileReader}; confirm that
     * the charset it uses matches the encoding of the stop list files.</p>
     *
     * @param filespec path of the stop list file, one stop word per line
     * @throws IOException if the file cannot be opened or read; the original
     *         exception is logged and rethrown unchanged so that its concrete
     *         type and stack trace are preserved
     */
    protected void loadStoplist(String filespec) throws IOException
    {
        BufferedReader br = null;
        try
        {
            br = new BufferedReader(new FileReader(filespec));
            String line;
            while ((line = br.readLine()) != null)
            {
                stoplist_.put(line, line);
            }
        }
        catch (IOException ioe)
        {
            log_.severe("GenericFilter.loadStoplist('"+filespec+"') IOException:"+ioe);
            // Rethrow the original instead of a copy built from getMessage(),
            // which would discard the exception type and the stack trace.
            throw ioe;
        }
        finally
        {
            if (br != null)
            {
                try
                {
                    br.close();
                }
                catch (IOException closeFailure)
                {
                    // Closing is best effort: report the failure but do not
                    // let it replace the outcome of the read itself.
                    log_.warning("GenericFilter.loadStoplist('"+filespec+"') close failed:"+closeFailure);
                }
            }
        }
    }

    /**
     * @return the minimum number of characters a token must have to pass
     *         {@link #filter(String[])}
     */
    protected int getMinChars()
    {
        return this.minchars_;
    }

    /**
     * @param minchars minimum number of characters a token must have to pass
     *        {@link #filter(String[])}
     */
    protected void setMinChars(int minchars)
    {
        this.minchars_ = minchars;
    }

    /**
     * Returns the punctuation marks used by {@link #filter(String[])}.
     * The method name is misspelled ("Puctuation"); it is kept as is because
     * renaming it would break existing callers.
     *
     * @return the punctuation string, never {@code null}
     */
    protected String getPuctuationMarks()
    {
        return this.punctmarks_;
    }

    /**
     * Sets the punctuation marks used by {@link #filter(String[])}.
     *
     * @param marks string containing the punctuation marks; {@code null} is
     *        stored as the empty string so that filtering cannot fail on it
     */
    protected void setPunctuationMarks(String marks)
    {
        this.punctmarks_ = (marks == null) ? "" : marks;
    }

    /**
     * Tells whether a word is in the stop list. The comparison is exact
     * (case-sensitive, no trimming).
     *
     * @param word the word to look up; may be {@code null}
     * @return {@code true} if the word is in the stop list, {@code false}
     *         otherwise or if {@code word} is {@code null}
     */
    public boolean isStopWord(String word)
    {
        // Hashtable rejects null keys with a NullPointerException.
        return word != null && stoplist_.containsKey(word);
    }

    /**
     * Returns the tokens that survive filtering, in their original order.
     * A token is dropped when any of the following holds:
     * <ul>
     * <li>it is {@code null} or empty;</li>
     * <li>it has fewer characters than the configured minimum;</li>
     * <li>it occurs as a contiguous substring of the punctuation string;</li>
     * <li>its first character is a digit;</li>
     * <li>it is in the stop list.</li>
     * </ul>
     *
     * @param words the tokens to filter; may be {@code null}
     * @return a new array with the surviving tokens; empty if {@code words}
     *         is {@code null} or nothing survives
     */
    public String[] filter(String[] words)
    {
        if (words == null)
        {
            return new String[0];
        }
        ArrayList<String> kept = new ArrayList<String>(words.length);
        for (String word : words)
        {
            // Explicit guard: charAt(0) below requires a non-empty token.
            if (word == null || word.length() == 0)
            {
                continue;
            }
            if (word.length() < this.minchars_)
            {
                continue;
            }
            if (this.punctmarks_.indexOf(word) >= 0)
            {
                continue;
            }
            if (Character.isDigit(word.charAt(0)))
            {
                continue;
            }
            if (!stoplist_.containsKey(word))
            {
                kept.add(word);
            }
        }
        return kept.toArray(new String[kept.size()]);
    }
}