/*
 * NormalizingTermProcessor.java
 *
 * Copyright (c) 2012, The University of Sheffield.
 *
 * This file is part of GATE Mímir (see http://gate.ac.uk/family/mimir.html),
 * and is free software, licenced under the GNU Lesser General Public License,
 * Version 3, June 2007 (also included with this distribution as file
 * LICENCE-LGPL3.html).
 *
 * Mark A. Greenwood, 5 January 2012
 *
 * $Id$
 */
package gate.mimir.util;

import it.unimi.dsi.lang.MutableString;
import it.unimi.di.big.mg4j.index.Index;
import it.unimi.di.big.mg4j.index.TermProcessor;

import java.io.InputStream;
import java.lang.reflect.Method;
import java.lang.reflect.UndeclaredThrowableException;
import java.text.Normalizer;
import java.util.regex.Pattern;

/**
 * A {@link TermProcessor} that lower-cases each term and strips combining
 * diacritical marks from it, so that accented and unaccented spellings of a
 * term end up as the same character sequence.
 *
 * <p>The class has no instance fields and exposes a single shared instance
 * through {@link #getInstance()}; the constructor is private.
 */
public class NormalizingTermProcessor implements TermProcessor {

  /**
   * Matches runs of characters in the Unicode "Combining Diacritical Marks"
   * block. Compiled once here rather than on every call to
   * {@link #processTerm(MutableString)}.
   */
  private static final Pattern COMBINING_MARKS =
      Pattern.compile("\\p{InCombiningDiacriticalMarks}+");

  /** The one shared instance handed out by {@link #getInstance()}. */
  private static final NormalizingTermProcessor INSTANCE =
      new NormalizingTermProcessor();

  /**
   * Returns the shared instance of this term processor.
   *
   * @return the singleton instance
   */
  public static final TermProcessor getInstance() {
    return INSTANCE;
  }

  private NormalizingTermProcessor() {
    // nothing to do but this method forces people to use the getInstance()
    // method
  }

  /**
   * Normalizes the given term in place: the term is lower-cased, decomposed
   * to Unicode NFD form, and every combining diacritical mark is removed.
   *
   * @param term the term to normalize; its content is overwritten with the
   *          normalized text
   * @return {@code false} if {@code term} is {@code null}, {@code true}
   *         otherwise
   */
  @Override
  public boolean processTerm(final MutableString term) {
    if (term == null) {
      return false;
    }

    term.toLowerCase();

    // http://glaforge.appspot.com/article/how-to-remove-accents-from-a-string
    // NFD splits an accented character into its base character followed by
    // combining marks, which the pattern then deletes.
    final String decomposed = Normalizer.normalize(term, Normalizer.Form.NFD);
    term.replace(COMBINING_MARKS.matcher(decomposed).replaceAll(""));

    return true;
  }

  /**
   * Normalizes a prefix exactly as {@link #processTerm(MutableString)}
   * normalizes a full term.
   *
   * @param prefix the prefix to normalize in place
   * @return the result of {@link #processTerm(MutableString)} for the prefix
   */
  @Override
  public boolean processPrefix(final MutableString prefix) {
    return processTerm(prefix);
  }

  /**
   * Java serialization hook: when an instance of this class is deserialized,
   * the shared instance is returned in its place.
   *
   * @return the singleton instance
   */
  private Object readResolve() {
    return INSTANCE;
  }

  /**
   * Returns the fully qualified name of this class.
   *
   * @return the class name
   */
  @Override
  public String toString() {
    return this.getClass().getName();
  }

  /**
   * Returns this same instance rather than a new object; the class declares
   * no instance fields, so there is nothing to duplicate.
   *
   * @return this instance
   */
  @Override
  public NormalizingTermProcessor copy() {
    return this;
  }
}