//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;

import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;

import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractQuantityRegexAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;

/**
 * Annotate areas within a document using regular expressions
 *
 * <p>The document content is searched for things that might represent areas using regular
 * expressions. Any extracted areas are normalized to m^2.</p>
 *
 * <p>Every pattern has the same capturing-group layout: group 1 is the numeric amount, group 3 is
 * the optional multiplier word (thousand, million, ...), group 4 is the unit and group 5 is the
 * optional plural "s".</p>
 */
public class Area extends AbstractQuantityRegexAnnotator {
  /** Conversion factor from square millimetres to square metres. */
  public static final double MM2_TO_M2 = 0.000001;
  /** Conversion factor from square centimetres to square metres. */
  public static final double CM2_TO_M2 = 0.0001;
  /** Conversion factor from square kilometres to square metres. */
  public static final double KM2_TO_M2 = 1000000.0;
  /** Conversion factor from square miles to square metres. */
  public static final double MI2_TO_M2 = 2589988.1;
  /** Conversion factor from square yards to square metres. */
  public static final double YD2_TO_M2 = 0.83612739;
  /** Conversion factor from square feet to square metres. */
  public static final double FT2_TO_M2 = 0.092903044;
  /** Conversion factor from square inches to square metres. */
  public static final double IN2_TO_M2 = 0.000064516;
  /** Conversion factor from acres to square metres. */
  public static final double ACRE_TO_M2 = 4046.8564;
  /** Conversion factor from hectares to square metres. */
  public static final double HECTARE_TO_M2 = 10000.0;

  /** Shared prefix: a number (group 1) followed by an optional multiplier word (group 3). */
  private static final String AMOUNT_PREFIX =
      "\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(thousand|million|billion|trillion)?[ ]?";

  /** Shared suffix: an optional plural "s" (group 5) and a closing word boundary. */
  private static final String PLURAL_SUFFIX = "(s)?\\b";

  // Patterns are immutable and safe to share, so they are compiled once per class rather than
  // once per annotator instance.
  private static final Pattern M2_PATTERN =
      areaPattern("m\\^2|square metre|square meter|square m");
  private static final Pattern MM2_PATTERN =
      areaPattern("mm\\^2|square millimetre|square millimeter|square mm");
  private static final Pattern CM2_PATTERN =
      areaPattern("cm\\^2|square centimetre|square centimeter|square cm");
  // Lists the singular "square kilometer"; the plural is handled by the shared suffix, so both
  // "square kilometer" and "square kilometers" are recognised.
  private static final Pattern KM2_PATTERN =
      areaPattern("km\\^2|square kilometre|square kilometer|square km");
  // Lists the singular "square mile"; the plural is handled by the shared suffix, so both
  // "square mile" and "square miles" are recognised.
  private static final Pattern MI2_PATTERN = areaPattern("mi\\^2|square mile|square mi");
  private static final Pattern YD2_PATTERN = areaPattern("yd\\^2|square yard|square yd");
  private static final Pattern FT2_PATTERN =
      areaPattern("ft\\^2|square foot|square feet|square ft");
  private static final Pattern IN2_PATTERN =
      areaPattern("in\\^2|square inch|square in|square inche");
  // The lookahead stops the abbreviation "ha" from combining with the plural suffix, otherwise
  // ordinary text such as "Chapter 5 has ..." would be annotated as 5 hectares.
  private static final Pattern HA_PATTERN = areaPattern("hectare|ha(?!s)");
  private static final Pattern ACRE_PATTERN = areaPattern("acre");

  /**
   * Constructor
   */
  public Area() {
    super("m^2", "area");
  }

  /**
   * Build a case-insensitive pattern matching an amount followed by one of the given units.
   *
   * @param unitAlternatives regex alternation (without enclosing parentheses) of unit spellings;
   *     it becomes capturing group 4 of the resulting pattern
   * @return the compiled pattern
   */
  private static Pattern areaPattern(String unitAlternatives) {
    return Pattern.compile(
        AMOUNT_PREFIX + "(" + unitAlternatives + ")" + PLURAL_SUFFIX, Pattern.CASE_INSENSITIVE);
  }

  /**
   * Run every area pattern over the covered text of the block, passing each unit's label and its
   * conversion factor to square metres on to {@code process}.
   *
   * @param block the text block to search
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    String text = block.getCoveredText();

    process(block, text, M2_PATTERN, "m^2", 1);
    process(block, text, CM2_PATTERN, "cm^2", CM2_TO_M2);
    process(block, text, MM2_PATTERN, "mm^2", MM2_TO_M2);
    process(block, text, KM2_PATTERN, "km^2", KM2_TO_M2);
    process(block, text, MI2_PATTERN, "mi^2", MI2_TO_M2);
    process(block, text, YD2_PATTERN, "yd^2", YD2_TO_M2);
    process(block, text, FT2_PATTERN, "ft^2", FT2_TO_M2);
    process(block, text, IN2_PATTERN, "in^2", IN2_TO_M2);
    process(block, text, ACRE_PATTERN, "acre", ACRE_TO_M2);
    process(block, text, HA_PATTERN, "ha", HECTARE_TO_M2);
  }
}