//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;

import java.util.regex.Pattern;

import org.apache.uima.analysis_engine.AnalysisEngineProcessException;

import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractQuantityRegexAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;

/**
 * Annotate volumes within a document using regular expressions.
 *
 * <p>The document content is searched for things that might represent volumes using regular
 * expressions. Any extracted volumes are normalized to m^3.</p>
 *
 * <p>Recognised units are cubic metres, cubic centimetres, litres, millilitres, pints and
 * gallons, each optionally preceded by a multiplier word (thousand, million, billion or
 * trillion). Matching is case insensitive.</p>
 */
public class Volume extends AbstractQuantityRegexAnnotator {
  /** Cubic metres per pint: 0.000568 m^3, i.e. 568 ml (corresponds to the imperial pint). */
  public static final double PINT_TO_M3 = 0.000568;

  /** Cubic metres per gallon: 0.00454609 m^3, i.e. 4.54609 litres (the imperial gallon). */
  public static final double GALLON_TO_M3 = 0.00454609;

  /** Cubic metres per cubic centimetre. */
  private static final double CM3_TO_M3 = 1 / 1000000.0;

  /** Cubic metres per litre. */
  private static final double LITRE_TO_M3 = 1 / 1000.0;

  /** Cubic metres per millilitre. */
  private static final double MILLILITRE_TO_M3 = 1 / 1000000.0;

  /**
   * Shared start of every volume pattern: a number (digits, optionally containing '.' or ','
   * separators), then an optional multiplier word, each followed by an optional single space.
   */
  private static final String QUANTITY_PREFIX =
      "\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(thousand|million|billion|trillion)?[ ]?";

  /** Shared end of every volume pattern: an optional plural "s" followed by a word boundary. */
  private static final String PLURAL_SUFFIX = "(s)?\\b";

  // Patterns are immutable, so they are compiled once per class rather than once per
  // annotator instance.
  private static final Pattern M3_PATTERN = unitPattern("m\\^3|cubic metre|cubic meter");
  private static final Pattern CM3_PATTERN =
      unitPattern("cm\\^3|cubic centimetre|cubic centimeter");
  private static final Pattern L_PATTERN = unitPattern("l|litre|liter");
  private static final Pattern ML_PATTERN = unitPattern("ml|millilitre|milliliter");
  private static final Pattern PINT_PATTERN = unitPattern("pt|pint");
  private static final Pattern GALLON_PATTERN = unitPattern("gal|gallon");

  /**
   * Constructor.
   *
   * <p>Passes "m^3" and "volume" to the superclass; presumably these are the normalized unit
   * and the quantity type respectively (confirm against AbstractQuantityRegexAnnotator).</p>
   */
  public Volume() {
    super("m^3", "volume");
  }

  /**
   * Runs each volume pattern over the covered text of the block, handing every pattern to the
   * {@code process} helper together with its unit label and the factor that converts that
   * unit into cubic metres.
   *
   * @param block the text block whose covered text is searched
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    String text = block.getCoveredText();

    process(block, text, M3_PATTERN, "m^3", 1.0);
    process(block, text, CM3_PATTERN, "cm^3", CM3_TO_M3);
    process(block, text, L_PATTERN, "l", LITRE_TO_M3);
    process(block, text, ML_PATTERN, "ml", MILLILITRE_TO_M3);
    process(block, text, PINT_PATTERN, "pt", PINT_TO_M3);
    process(block, text, GALLON_PATTERN, "gal", GALLON_TO_M3);
  }

  /**
   * Builds the case-insensitive pattern for one unit by wrapping the unit alternatives in a
   * capturing group between the shared quantity prefix and plural suffix.
   *
   * <p>The optional plural "s" applies to every alternative, abbreviations included.</p>
   *
   * @param unitAlternatives regex alternation of the accepted spellings of the unit, without
   *     surrounding parentheses
   * @return the compiled pattern
   */
  private static Pattern unitPattern(String unitAlternatives) {
    return Pattern.compile(
        QUANTITY_PREFIX + "(" + unitAlternatives + ")" + PLURAL_SUFFIX,
        Pattern.CASE_INSENSITIVE);
  }
}