//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.cleaners; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import org.apache.commons.lang.StringUtils; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.fit.util.JCasUtil; import org.apache.uima.jcas.JCas; import com.google.common.collect.ImmutableSet; import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction; import uk.gov.dstl.baleen.types.semantic.Entity; import uk.gov.dstl.baleen.uima.BaleenAnnotator; /** * Clean up the punctuation in entities * * <p>Cleans punctuation by doing the following:</p> * <ul> * <li>Removing any characters from the start or end of a string that aren't alphanumeric, brackets or currency symbols</li> * <li>Counting brackets and if there is a mismatch, and the offending bracket * appears at the start or end of the string, then it is removed</li> * <li>Removing any entities that are empty following the above actions</li> * </ul> * This annotator handles the value and the covered text of the entity, and will * address them separately if required. * * */ public class CleanPunctuation extends BaleenAnnotator { private static final String ALLOWED_CHARACTERS_START_END = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789()£$€-"; @Override public void doProcess(JCas jCas) throws AnalysisEngineProcessException { Collection<Entity> annotations = JCasUtil.select(jCas, Entity.class); Collection<Entity> toRemove = new ArrayList<>(); for (Entity e : annotations) { if (!e.getCoveredText().isEmpty()) { cleanCoveredText(e); if (e.getCoveredText().isEmpty()) { toRemove.add(e); continue; } } if (e.getValue() != null) { e.setValue(cleanValue(e.getValue())); } } removeFromJCasIndex(toRemove); } private void cleanCoveredText(Entity e) { int begin = e.getBegin(); int end = e.getEnd(); // Remove punctuation from the coverText String coverText = e.getCoveredText(); String startStripped = removeStartPunctuation(coverText); String bothStripped = removeEndPunctuation(startStripped); begin = begin + (coverText.length() - startStripped.length()); end = begin + bothStripped.length(); // Remove excessive brackets int openingBrackets = StringUtils.countMatches(bothStripped, "("); int closingBrackets = StringUtils.countMatches(bothStripped, ")"); if (openingBrackets > closingBrackets) { int diff = openingBrackets - closingBrackets; for (int i = 0; i < diff; i++) { if (bothStripped.charAt(i) == '(') { begin += 1; } else { break; } } } else if (openingBrackets < closingBrackets) { int diff = closingBrackets - openingBrackets; for (int i = bothStripped.length() - 1; i > coverText.length() - 1 - diff; i--) { if (bothStripped.charAt(i) == ')') { end -= 1; } else { break; } } } e.setBegin(begin); e.setEnd(end); } private String removeStartPunctuation(String s){ String ret = s; while(ret.length() > 0 && !ALLOWED_CHARACTERS_START_END.contains(ret.substring(0, 1))){ ret = ret.substring(1); } return ret; } private String removeEndPunctuation(String s){ String ret = s; while(ret.length() > 0 && !ALLOWED_CHARACTERS_START_END.contains(ret.substring(ret.length() - 1))){ ret = ret.substring(0, ret.length() - 1); } return ret; } private String cleanValue(String inputValue) { // Remove puncations from the value String value = removeEndPunctuation(removeStartPunctuation(inputValue)); // Remove excess brackets int openingBrackets = StringUtils.countMatches(value, "("); int closingBrackets = StringUtils.countMatches(value, ")"); if (openingBrackets > closingBrackets) { int diff = openingBrackets - closingBrackets; for (int i = 0; i < diff; i++) { if (value.startsWith("(")) { value = value.substring(1); } else { break; } } } else if (openingBrackets < closingBrackets) { int diff = closingBrackets - openingBrackets; for (int i = 0; i < diff; i++) { if (value.endsWith(")")) { value = value.substring(0, value.length() - 1); } else { break; } } } return value; } @Override public AnalysisEngineAction getAction() { return new AnalysisEngineAction(ImmutableSet.of(Entity.class), Collections.emptySet()); } }