package org.xbib.elasticsearch.index.analysis.symbolname;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PackedTokenAttributeImpl;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

import java.io.IOException;
import java.nio.charset.CharacterCodingException;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Token filter that emits, in addition to each incoming token, variants in which
 * every non-letter character is replaced by its Unicode character name.
 *
 * <p>Each non-letter code point is rewritten to {@code " __NAME__"}, where
 * {@code NAME} is the Unicode name with whitespace and hyphens removed. For the
 * term {@code C++} the variants are {@code "C __PLUSSIGN__ __PLUSSIGN__"}
 * followed by its whitespace-separated parts {@code "C"}, {@code "__PLUSSIGN__"}
 * and {@code "__PLUSSIGN__"}. The original token is always emitted first; every
 * variant is emitted afterwards with a position increment of {@code 0}.
 */
public class SymbolnameTokenFilter extends TokenFilter {

    /** Matches a single code point that is not a Unicode letter. */
    private static final Pattern NON_LETTER =
            Pattern.compile("\\P{L}", Pattern.UNICODE_CHARACTER_CLASS);

    /** Characters stripped from a Unicode character name: whitespace and hyphens. */
    private static final Pattern NAME_SEPARATORS = Pattern.compile("[\\s\\-]");

    /** Variants of the most recently consumed input token that are still to be emitted. */
    private final LinkedList<PackedTokenAttributeImpl> tokens;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

    private final PositionIncrementAttribute posIncAtt =
            addAttribute(PositionIncrementAttribute.class);

    /** Attribute state captured for the input token whose variants are queued in {@link #tokens}. */
    private State current;

    /**
     * Creates the filter on top of the given stream.
     *
     * @param input the token stream to wrap
     */
    protected SymbolnameTokenFilter(TokenStream input) {
        super(input);
        this.tokens = new LinkedList<>();
    }

    /**
     * Emits the next pending variant if there is one, otherwise advances the
     * wrapped stream and queues the variants of the token it produced.
     *
     * @return {@code true} if a token was produced, {@code false} once the wrapped
     *         stream is exhausted and no variants are pending
     * @throws IOException if the wrapped stream fails
     * @throws IllegalArgumentException if variants are pending but no state was captured
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!tokens.isEmpty()) {
            if (current == null) {
                throw new IllegalArgumentException("current is null");
            }
            PackedTokenAttributeImpl token = tokens.removeFirst();
            // Start from the attributes of the original token, then swap in the
            // variant text and stack it on the original's position.
            restoreState(current);
            termAtt.setEmpty().append(token);
            posIncAtt.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        process();
        if (!tokens.isEmpty()) {
            // Only capture state when there is something to replay it for.
            current = captureState();
        }
        return true;
    }

    /**
     * Resets the wrapped stream and discards pending variants and captured state.
     *
     * @throws IOException if resetting the wrapped stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        tokens.clear();
        current = null;
    }

    /**
     * Computes the variants of the current term and appends them to the pending queue.
     * {@code null} entries returned by {@link #process(String)} are skipped.
     *
     * @throws CharacterCodingException declared for overriding implementations; this
     *         implementation does not throw it
     */
    protected void process() throws CharacterCodingException {
        String term = termAtt.toString();
        for (CharSequence charSequence : process(term)) {
            if (charSequence != null) {
                PackedTokenAttributeImpl token = new PackedTokenAttributeImpl();
                token.append(charSequence);
                tokens.add(token);
            }
        }
    }

    /**
     * Builds the symbol-name variants of a term.
     *
     * <p>If the term contains at least one non-letter code point, the result holds the
     * fully rewritten term and, when that rewritten term contains a space, each of its
     * whitespace-separated parts (duplicates are kept). If the term consists of letters
     * only, is empty or is {@code null}, the result is empty.
     *
     * @param term the term text to analyse
     * @return the variants in emission order; never {@code null}
     */
    protected Collection<CharSequence> process(String term) {
        Collection<CharSequence> variants = new LinkedList<>();
        if (term == null || term.isEmpty()) {
            return variants;
        }
        // StringBuffer rather than StringBuilder: Matcher.appendReplacement only
        // accepts a StringBuilder from Java 9 onwards.
        StringBuffer sb = new StringBuffer();
        Matcher m = NON_LETTER.matcher(term);
        while (m.find()) {
            // The regex matches whole code points, so a match may be a surrogate
            // pair; look up the name of the code point, not of its first char.
            int codePoint = m.group().codePointAt(0);
            String symbolname = " __" + symbolName(codePoint) + "__";
            m.appendReplacement(sb, Matcher.quoteReplacement(symbolname));
        }
        m.appendTail(sb);
        String variant = sb.toString().trim();
        if (!variant.equals(term)) {
            variants.add(variant);
            if (variant.indexOf(' ') >= 0) {
                Collections.addAll(variants, variant.split("\\s"));
            }
        }
        return variants;
    }

    /**
     * Returns the compact, upper-case Unicode name of a code point, with whitespace
     * and hyphens removed. Code points for which {@link Character#getName(int)}
     * returns {@code null} get the placeholder {@code U} followed by at least four
     * upper-case hex digits.
     */
    private static String symbolName(int codePoint) {
        String name = Character.getName(codePoint);
        if (name == null) {
            return String.format(Locale.ROOT, "U%04X", codePoint);
        }
        return NAME_SEPARATORS.matcher(name.toUpperCase(Locale.ROOT)).replaceAll("");
    }

    /**
     * Treats every {@code SymbolnameTokenFilter} instance as equal to every other.
     * NOTE(review): presumably intentional because the filter has no configuration;
     * confirm.
     *
     * @param object the object to compare with
     * @return {@code true} if {@code object} is a {@code SymbolnameTokenFilter}
     */
    @Override
    public boolean equals(Object object) {
        return object instanceof SymbolnameTokenFilter;
    }

    /**
     * Returns a constant, consistent with {@link #equals(Object)} treating all
     * instances as equal.
     *
     * @return always {@code 0}
     */
    @Override
    public int hashCode() {
        return 0;
    }
}