//Dstl (c) Crown Copyright 2017
package uk.gov.dstl.baleen.annotators.regex;
import java.util.regex.Pattern;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import uk.gov.dstl.baleen.annotators.regex.helpers.AbstractQuantityRegexAnnotator;
import uk.gov.dstl.baleen.uima.data.TextBlock;
/**
 * Annotate weights within a document using regular expressions.
 *
 * <p>The document content is searched for things that might represent weights using regular
 * expressions. Any extracted weights are normalized to KG.</p>
 *
 * <p>Every unit pattern has the same shape: a number (digits, optionally with {@code .} or
 * {@code ,} inside it), an optional multiplier word ({@code thousand}, {@code million},
 * {@code billion} or {@code trillion}), the unit name or abbreviation, and an optional plural
 * {@code s}. Matching is case-insensitive and anchored on word boundaries.</p>
 */
public class Weight extends AbstractQuantityRegexAnnotator {
  /** Kilograms in one long ton. */
  public static final double LONG_TON_TO_KG = 1016.0469088;

  /** Kilograms in one stone. */
  public static final double STONE_TO_KG = 6.35029318;

  /** Kilograms in one pound. */
  public static final double POUNDS_TO_KG = 0.45359237;

  /** Kilograms in one ounce. */
  public static final double OUNCES_TO_KG = 0.028349523125;

  /**
   * Shared start of every weight pattern: group 1 is the whole number, group 2 its optional
   * tail (separators and further digits), group 3 the optional multiplier word.
   */
  private static final String QUANTITY_PREFIX =
      "\\b([0-9]+([0-9\\.,]+[0-9])?)[ ]?(thousand|million|billion|trillion)?[ ]?";

  /** Shared end of every weight pattern: group 5 is the optional plural "s". */
  private static final String PLURAL_SUFFIX = "(s)?\\b";

  // Patterns are immutable and safe to share, so they are compiled once per class load
  // rather than once per annotator instance.
  private static final Pattern TONNE_PATTERN = unitPattern("tonne");
  private static final Pattern KG_PATTERN = unitPattern("kg|kilogram|kilo");
  private static final Pattern G_PATTERN = unitPattern("g|gram");
  private static final Pattern MG_PATTERN = unitPattern("mg|milligram");
  private static final Pattern TON_PATTERN = unitPattern("ton");
  private static final Pattern LB_PATTERN = unitPattern("lb");
  // NOTE(review): because "st" may follow the digits directly, this also matches ordinals
  // such as "1st" or "21st". Left unchanged here since "12st" is a legitimate abbreviation.
  private static final Pattern STONE_PATTERN = unitPattern("st|stone");
  private static final Pattern OZ_PATTERN = unitPattern("oz|ounce");

  /**
   * Constructor.
   *
   * <p>Passes {@code "kg"} and {@code "weight"} to the superclass; presumably these are the
   * normalized unit and the quantity type, to be confirmed against
   * {@link AbstractQuantityRegexAnnotator}.</p>
   */
  public Weight() {
    super("kg", "weight");
  }

  /**
   * Build the case-insensitive pattern for one unit.
   *
   * <p>The alternatives are wrapped in their own capture group (group 4), so the group
   * numbering is the same for every unit.</p>
   *
   * @param unitAlternatives regex alternation of the accepted singular unit names, for
   *     example {@code "kg|kilogram|kilo"}
   * @return the compiled pattern
   */
  private static Pattern unitPattern(String unitAlternatives) {
    return Pattern.compile(
        QUANTITY_PREFIX + "(" + unitAlternatives + ")" + PLURAL_SUFFIX,
        Pattern.CASE_INSENSITIVE);
  }

  /**
   * Run every unit pattern over the text covered by the block.
   *
   * <p>Each call hands the superclass {@code process} method the pattern, a unit label and the
   * factor for that unit relative to kilograms. A bare "ton" is treated as a long ton.</p>
   *
   * @param block the text block to search
   * @throws AnalysisEngineProcessException declared by the overridden method
   */
  @Override
  public void doProcessTextBlock(TextBlock block) throws AnalysisEngineProcessException {
    String text = block.getCoveredText();

    // Metric units.
    process(block, text, TONNE_PATTERN, "tonne", 1000);
    process(block, text, KG_PATTERN, "kg", 1.0);
    process(block, text, G_PATTERN, "g", 1.0 / 1000.0);
    process(block, text, MG_PATTERN, "mg", 1.0 / 1000000.0);

    // Imperial units.
    process(block, text, TON_PATTERN, "long ton", LONG_TON_TO_KG);
    process(block, text, STONE_PATTERN, "st", STONE_TO_KG);
    process(block, text, LB_PATTERN, "lb", POUNDS_TO_KG);
    process(block, text, OZ_PATTERN, "oz", OUNCES_TO_KG);
  }
}