package org.apache.lucene.analysis.jate;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Token filter that removes characters matched by the regex class {@code \p{Punct}}
 * from each term of the wrapped stream.
 *
 * <p>Given the string {@code "(' delete + all ) the symbols.+"}:
 * <ul>
 *   <li>stripAnySymbols will return {@code "delete all the symbols"}</li>
 *   <li>stripLeadingSymbols will return {@code "delete + all ) the symbols.+"}</li>
 *   <li>stripTrailingSymbols will return {@code "(' delete + all ) the symbols"}</li>
 * </ul>
 *
 * <p>When stripAnySymbols is enabled the leading/trailing options are ignored.
 */
public final class PunctuationRemover extends TokenFilter {

    public static boolean DEFAULT_STRIP_LEADING_SYMBOLS = false;
    public static boolean DEFAULT_STRIP_TRAILING_SYMBOLS = false;
    public static boolean DEFAULT_STRIP_ANY_SYMBOLS = false;

    /** A run of symbols at the start, optionally followed by whitespace and a second run of symbols. */
    protected static final Pattern leadingSymbolPattern =
            Pattern.compile("^[\\p{Punct}]+[\\s]*[\\p{Punct}]*");

    /** A run of symbols at the end, optionally preceded by whitespace and another run of symbols. */
    protected static final Pattern trailingSymbolPattern =
            Pattern.compile("[\\p{Punct}]*[\\s]*[\\p{Punct}]+$");

    /** Single symbol; precompiled so the strip-any path does not recompile it per token. */
    private static final Pattern ANY_SYMBOL_PATTERN = Pattern.compile("\\p{Punct}");

    /** One or more whitespace characters; used to collapse the gaps left by removed symbols. */
    private static final Pattern WHITESPACE_RUN_PATTERN = Pattern.compile("\\s+");

    private boolean stripLeadingSymbols;
    private boolean stripTrailingSymbols;
    private boolean stripAnySymbols;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    /**
     * Construct a token stream filtering the given input.
     *
     * @param input                the token stream whose terms are filtered
     * @param stripAnySymbols      remove every symbol in the term; takes precedence over the other two flags
     * @param stripLeadingSymbols  remove symbols at the start of the term
     * @param stripTrailingSymbols remove symbols at the end of the term
     */
    protected PunctuationRemover(TokenStream input,
                                 boolean stripAnySymbols,
                                 boolean stripLeadingSymbols,
                                 boolean stripTrailingSymbols) {
        super(input);
        this.stripAnySymbols = stripAnySymbols;
        this.stripLeadingSymbols = stripLeadingSymbols;
        this.stripTrailingSymbols = stripTrailingSymbols;
    }

    /**
     * Advances the wrapped stream and rewrites the current term with its symbols stripped.
     *
     * <p>A term that is empty after trimming is passed through unchanged. A term that consists
     * only of symbols is emitted with an empty term text; the token itself is not dropped.
     *
     * @return {@code true} if a token is available, {@code false} at end of stream
     * @throws IOException if the wrapped stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!input.incrementToken()) {
            return false;
        }

        String tok = termAtt.toString().trim();
        if (tok.isEmpty()) {
            return true;
        }

        String normalised =
                stripPunctuations(tok, stripAnySymbols, stripLeadingSymbols, stripTrailingSymbols);

        // Only the term text is replaced. Calling clearAttributes() here would also reset the
        // offsets, position increment and every other attribute populated upstream.
        // When normalised is empty this simply leaves an empty term.
        termAtt.setEmpty().append(normalised);
        return true;
    }

    /**
     * Strips symbols from {@code tok} according to the given flags.
     *
     * <p>With {@code stripAnySymbols} every symbol is replaced by a space, whitespace runs are
     * collapsed to a single space and the result is trimmed. Otherwise the leading and/or
     * trailing symbol runs are cut off (as selected) and the result is trimmed.
     *
     * @param tok                  the text to clean; must not be {@code null}
     * @param stripAnySymbols      remove every symbol; the other two flags are then ignored
     * @param stripLeadingSymbols  remove the match of {@link #leadingSymbolPattern}
     * @param stripTrailingSymbols remove the match of {@link #trailingSymbolPattern}
     * @return the cleaned, trimmed text; possibly empty
     */
    public static String stripPunctuations(String tok,
                                           boolean stripAnySymbols,
                                           boolean stripLeadingSymbols,
                                           boolean stripTrailingSymbols) {
        if (stripAnySymbols) {
            String spaced = ANY_SYMBOL_PATTERN.matcher(tok).replaceAll(" ");
            return WHITESPACE_RUN_PATTERN.matcher(spaced).replaceAll(" ").trim();
        }

        if (stripLeadingSymbols) {
            Matcher leading = leadingSymbolPattern.matcher(tok);
            if (leading.find()) {
                tok = tok.substring(leading.end());
            }
        }

        // Runs on whatever is left after the leading strip, so a symbols-only token
        // that was already emptied above is simply not matched again.
        if (stripTrailingSymbols) {
            Matcher trailing = trailingSymbolPattern.matcher(tok);
            if (trailing.find()) {
                tok = tok.substring(0, trailing.start());
            }
        }

        return tok.trim();
    }
}