//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import com.google.common.collect.ImmutableSet;
import uk.gov.dstl.baleen.core.pipelines.orderers.AnalysisEngineAction;
import uk.gov.dstl.baleen.types.common.Organisation;
import uk.gov.dstl.baleen.uima.BaleenTextAwareAnnotator;
import uk.gov.dstl.baleen.uima.ComparableTextSpan;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
 * Find British Army units using regular expressions.
 *
 * <p>Three patterns locate sections (e.g. "2 Sect"), platoons (e.g. "3 Pl") and companies (e.g.
 * "B Coy") in each text block. A unit is then merged with a directly following unit from the next
 * level up in the hierarchy when the two are separated only by a space, a comma, or a comma and a
 * space (e.g. "2 Sect, 3 Pl B Coy" becomes a single unit). Every resulting span, merged or not, is
 * added to the JCas as an {@link Organisation} with a confidence of 1.0.
 */
public class BritishArmyUnits extends BaleenTextAwareAnnotator {
  // Patterns are immutable, so they are compiled once and shared by all instances.
  private static final Pattern SECTION_PATTERN = Pattern.compile("\\b\\d+ Sect\\b");
  private static final Pattern PLATOON_PATTERN = Pattern.compile("\\b\\d+ Pl\\b");
  private static final Pattern COMPANY_PATTERN = Pattern.compile("\\b[A-Z] Coy\\b");

  /** Text allowed between two units for them to be merged into a single unit. */
  private static final String[] SEPARATORS = {" ", ",", ", "};

  private static final int HIERARCHY_SECTION = 1;
  private static final int HIERARCHY_PLATOON = 2;
  private static final int HIERARCHY_COMPANY = 3;

  /**
   * Finds unit mentions in the block, merges adjacent units level by level, and annotates each
   * resulting span as an {@link Organisation}.
   *
   * @param block the text block to annotate
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    String documentText = block.getCoveredText();

    // 1. Find all sections, platoons and companies, keyed by their level in the hierarchy
    Map<Integer, List<ComparableTextSpan>> hierarchySpans = new HashMap<>();
    hierarchySpans.put(
        HIERARCHY_SECTION, ComparableTextSpan.buildSpans(documentText, SECTION_PATTERN));
    hierarchySpans.put(
        HIERARCHY_PLATOON, ComparableTextSpan.buildSpans(documentText, PLATOON_PATTERN));
    hierarchySpans.put(
        HIERARCHY_COMPANY, ComparableTextSpan.buildSpans(documentText, COMPANY_PATTERN));

    // 2. Working upwards, merge when spans are separated by a space or a comma, and the second
    //    span is higher in the hierarchy. After each pass the next level holds everything found
    //    so far, so the highest level ends up containing every unit.
    SortedSet<Integer> hierarchyLevels = new TreeSet<>(hierarchySpans.keySet());
    int lowestLevel = hierarchyLevels.first();
    int highestLevel = hierarchyLevels.last();
    for (int level = lowestLevel; level < highestLevel; level++) {
      compareHierarchy(documentText, hierarchySpans, level);
    }

    // 3. Add spans to JCas as organisations
    for (ComparableTextSpan span : hierarchySpans.get(highestLevel)) {
      Organisation org = new Organisation(block.getJCas());
      org.setConfidence(1.0);
      block.setBeginAndEnd(org, span.getStart(), span.getEnd());
      org.setValue(span.getValue());
      addToJCasIndex(org);
    }
  }

  /**
   * Promotes every span at {@code level} to {@code level + 1}, merging it with a span from
   * {@code level + 1} where possible.
   *
   * <p>Each lower-level span is merged with the first higher-level span it can be merged with; a
   * higher-level span is consumed by at most one merge. Afterwards the entry for
   * {@code level + 1} is replaced by a new list containing the higher-level spans that were not
   * consumed, followed by the promoted (merged or unchanged) lower-level spans. The lists
   * previously stored in the map are not modified.
   *
   * @param documentText the text the span offsets refer to
   * @param hierarchySpans spans keyed by hierarchy level; the entry for {@code level + 1} is
   *     replaced
   * @param level the lower of the two levels to compare
   */
  private void compareHierarchy(
      String documentText, Map<Integer, List<ComparableTextSpan>> hierarchySpans, int level) {
    List<ComparableTextSpan> lowerSpans = hierarchySpans.get(level);
    List<ComparableTextSpan> higherSpans = hierarchySpans.get(level + 1);

    // Work on a copy so that only genuine higher-level spans are merge candidates; spans promoted
    // during this pass are collected separately and must never be matched against each other.
    List<ComparableTextSpan> unmergedHigher =
        higherSpans == null ? new ArrayList<>() : new ArrayList<>(higherSpans);
    List<ComparableTextSpan> promotedSpans = new ArrayList<>();

    if (lowerSpans != null) {
      for (ComparableTextSpan lower : lowerSpans) {
        ComparableTextSpan promoted = lower;
        for (int i = 0; i < unmergedHigher.size(); i++) {
          ComparableTextSpan merged =
              mergeSpansIfPossible(lower, unmergedHigher.get(i), documentText);
          if (merged != null) {
            promoted = merged;
            // The higher-level span is now part of the merged span
            unmergedHigher.remove(i);
            break;
          }
        }
        promotedSpans.add(promoted);
      }
    }

    unmergedHigher.addAll(promotedSpans);
    hierarchySpans.put(level + 1, unmergedHigher);
  }

  /**
   * Merges two spans if the text running from the start of the first to the end of the second
   * consists of exactly the first span's value, one of the allowed separators, and the second
   * span's value.
   *
   * @param s1 the span expected to come first
   * @param s2 the span expected to follow
   * @param documentText the text the span offsets refer to
   * @return a new span covering both spans and the separator, or {@code null} if they cannot be
   *     merged
   */
  private ComparableTextSpan mergeSpansIfPossible(
      ComparableTextSpan s1, ComparableTextSpan s2, final String documentText) {
    if (s1.getStart() >= s2.getEnd()) {
      // s2 does not end after s1 starts, so it cannot follow s1
      return null;
    }

    String text = documentText.substring(s1.getStart(), s2.getEnd());
    for (String separator : SEPARATORS) {
      if (text.equals(s1.getValue() + separator + s2.getValue())) {
        return new ComparableTextSpan(s1.getStart(), s2.getEnd(), text);
      }
    }
    return null;
  }

  /**
   * Describes this annotator's inputs and outputs: it requires no existing annotation types and
   * produces {@link Organisation} annotations.
   *
   * @return the action describing this annotator
   */
  @Override
  public AnalysisEngineAction getAction() {
    return new AnalysisEngineAction(Collections.emptySet(), ImmutableSet.of(Organisation.class));
  }
}