//Dstl (c) Crown Copyright 2017 package uk.gov.dstl.baleen.annotators.regex; import java.util.regex.Pattern; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractQuantityRegexAnnotator; import uk.gov.dstl.baleen.uima.data.TextBlock; /** * Annotate distances within a document using regular expressions * * <p>The document content is searched for things that might represent distances using regular expressions. * Any extracted distances are normalized to m.</p> * * <p>This annotator assumes that nm refers to nautical miles, not nanometres.</p> */ public class Distance extends AbstractQuantityRegexAnnotator { public static final double MI_TO_M = 1609.344; public static final double YD_TO_M = 0.9144; public static final double FT_TO_M = 0.3048; public static final double IN_TO_M = 0.0254; public static final double NM_TO_M = 1852.0; private final Pattern kmPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(km|kilometre|kilometer|click)(s)?\\b", Pattern.CASE_INSENSITIVE); private final Pattern mPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(m|metre|meter)(s)?\\b", Pattern.CASE_INSENSITIVE); private final Pattern cmPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(cm|centimetre|centimeter)(s)?\\b", Pattern.CASE_INSENSITIVE); private final Pattern mmPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(mm|millimetre|millimeter)(s)?\\b", Pattern.CASE_INSENSITIVE); private final Pattern miPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(mile)(s)?\\b", Pattern.CASE_INSENSITIVE); private final Pattern ydPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(yard|yd)(s)?\\b", Pattern.CASE_INSENSITIVE); private final Pattern ftPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(foot|feet|ft)\\b", Pattern.CASE_INSENSITIVE); private final Pattern inPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(inch|inches)\\b", Pattern.CASE_INSENSITIVE); private final Pattern nmPattern = Pattern.compile("\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(hundred|thousand|million|billion|trillion)?[ ]?(nm|nmi|nautical mile(s)?)\\b", Pattern.CASE_INSENSITIVE); /** * Constructor */ public Distance(){ super("m", "distance"); } @Override public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException { String text = block.getCoveredText(); process(block, text, kmPattern, "km", 1000); process(block, text, mPattern, "m", 1); process(block, text, cmPattern, "cm", 0.01); process(block, text, mmPattern, "mm", 0.001); process(block, text, miPattern, "mi", MI_TO_M); process(block, text, ydPattern, "yd", YD_TO_M); process(block, text, ftPattern, "ft", FT_TO_M); process(block, text, inPattern, "in", IN_TO_M); process(block, text, nmPattern, "nmi", NM_TO_M); } }