package edu.stanford.nlp.ie.qe;
import edu.stanford.nlp.ie.NumberNormalizer;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.tokensregex.CoreMapExpressionExtractor;
import edu.stanford.nlp.ling.tokensregex.Env;
import edu.stanford.nlp.ling.tokensregex.MatchedExpression;
import edu.stanford.nlp.ling.tokensregex.TokenSequencePattern;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Properties;
import java.util.regex.Pattern;
/**
 * Extracts quantifiable entities using rules.
 *
 * <p>An instance must be configured through {@link #init(String, Properties)} or
 * {@link #init(Options)} before {@link #extract(CoreMap)} or {@link #get(double, String)}
 * is used; until then the environment and the extractor are unset.
 *
 * <p>{@link #main(String[])} is a separate utility entry point that regenerates the
 * prefix and unit rule files from the files named in the {@code Options}.
 *
 * @author Angel Chang
 */
public class QuantifiableEntityExtractor {
  /** Field separator of the text-to-unit mapping file; compiled once and reused. */
  private static final Pattern TAB_PATTERN = Pattern.compile("\t");

  private Env env;
  private Options options;
  private CoreMapExpressionExtractor<MatchedExpression> extractor;

  /**
   * Creates an entity for the given amount and the unit bound to {@code unitName}
   * in this extractor's environment.
   *
   * @param amount numeric amount of the entity
   * @param unitName name under which the unit is looked up in the environment
   * @return a new entity pairing {@code amount} with the looked-up unit
   * @throws ClassCastException if the environment binds {@code unitName} to something
   *         that is not a {@code Unit}
   */
  public SimpleQuantifiableEntity get(double amount, String unitName) {
    return new SimpleQuantifiableEntity(amount, (Unit) env.get(unitName));
  }

  /**
   * Runs the configured rules over the annotation and returns the matched expressions.
   *
   * <p>If the annotation has no {@code NumerizedTokensAnnotation} yet, one is computed with
   * {@link NumberNormalizer#findAndMergeNumbers(CoreMap)} and stored on the annotation
   * (so the argument is modified) before matching.
   *
   * @param annotation annotation to extract from
   * @return expressions produced by the underlying extractor
   */
  public List<MatchedExpression> extract(CoreMap annotation) {
    if (!annotation.containsKey(CoreAnnotations.NumerizedTokensAnnotation.class)) {
      List<CoreMap> mergedNumbers = NumberNormalizer.findAndMergeNumbers(annotation);
      annotation.set(CoreAnnotations.NumerizedTokensAnnotation.class, mergedNumbers);
    }
    return extractor.extractExpressions(annotation);
  }

  // Initializing

  /**
   * Initializes this extractor from properties.
   *
   * @param name name passed to the {@code Options} constructor together with {@code props}
   * @param props properties to build the options from
   */
  public void init(String name, Properties props) {
    init(new Options(name, props));
  }

  /**
   * Initializes this extractor: stores the options, builds the environment, and then
   * creates the expression extractor from the configured grammar files.
   *
   * @param options configuration to use
   * @throws RuntimeException if the units or prefixes files cannot be loaded
   */
  public void init(Options options) {
    this.options = options;
    initEnv();
    extractor = createExtractor();
  }

  /**
   * Builds the expression extractor from {@code options.grammarFilename}, which may
   * list several files separated by commas or semicolons (surrounding whitespace ignored).
   */
  private CoreMapExpressionExtractor<MatchedExpression> createExtractor() {
    List<String> filenames = StringUtils.split(options.grammarFilename, "\\s*[,;]\\s*");
    return CoreMapExpressionExtractor.createExtractorFromFiles(env, filenames);
  }

  /**
   * Creates the environment used by the rules: matching runs over the numerized tokens,
   * is case insensitive, and has the units, prefixes, options and the numeric composite
   * annotation keys registered/bound.
   */
  private void initEnv() {
    env = TokenSequencePattern.getNewEnv();
    env.setDefaultTokensAnnotationKey(CoreAnnotations.NumerizedTokensAnnotation.class);
    // Do case insensitive matching
    env.setDefaultStringMatchFlags(Pattern.CASE_INSENSITIVE);
    env.setDefaultStringPatternFlags(Pattern.CASE_INSENSITIVE);
    try {
      Units.registerUnits(env, options.unitsFilename);
    } catch (IOException ex) {
      throw new RuntimeException("Error loading units from " + options.unitsFilename, ex);
    }
    try {
      UnitPrefix.registerPrefixes(env, options.prefixFilename);
    } catch (IOException ex) {
      throw new RuntimeException("Error loading prefixes from " + options.prefixFilename, ex);
    }
    env.bind("options", options);
    env.bind("numcomptype", CoreAnnotations.NumericCompositeTypeAnnotation.class);
    env.bind("numcompvalue", CoreAnnotations.NumericCompositeValueAnnotation.class);
  }

  /**
   * Writes one map definition followed by the regex variable built from its keys:
   * <pre>
   * MAP_NAME = {
   * entry,
   * entry
   * }
   * $RegexVar = CreateRegex(Keys(MAP_NAME))
   * </pre>
   *
   * @param pw destination writer
   * @param mapName name of the map being defined
   * @param regexVar name (without the leading {@code $}) of the regex variable
   * @param entries already formatted {@code "key": VALUE} entries
   */
  private static void writeMapWithRegex(PrintWriter pw, String mapName, String regexVar, List<String> entries) {
    pw.println(mapName + " = {");
    pw.println(StringUtils.join(entries, ",\n"));
    pw.println("}");
    pw.println("$" + regexVar + " = CreateRegex(Keys(" + mapName + "))");
  }

  /**
   * Surfaces write failures: {@link PrintWriter} never throws on a failed write, it only
   * records the error, so the flag has to be checked explicitly.
   */
  private static void failIfWriteError(PrintWriter pw, String outfile) throws IOException {
    if (pw.checkError()) {
      throw new IOException("Error writing to " + outfile);
    }
  }

  /**
   * Generates the SI prefix definitions (by name and by symbol) from the prefixes
   * loaded from {@code infile} and writes them to {@code outfile}.
   *
   * @param infile prefixes file to load
   * @param outfile file to write the definitions to
   * @throws IOException if the input cannot be loaded or the output cannot be written
   */
  private static void generatePrefixDefs(String infile, String outfile) throws IOException {
    List<UnitPrefix> prefixes = UnitPrefix.loadPrefixes(infile);
    List<String> byName = new ArrayList<>();
    List<String> bySymbol = new ArrayList<>();
    for (UnitPrefix prefix : prefixes) {
      if ("SI".equals(prefix.system)) {
        String constant = prefix.getName().toUpperCase();
        byName.add("\"" + prefix.name + "\": " + constant);
        bySymbol.add("\"" + prefix.symbol + "\": " + constant);
      }
    }
    // try-with-resources: the writer is closed even if writing fails part way
    try (PrintWriter pw = IOUtils.getPrintWriter(outfile)) {
      writeMapWithRegex(pw, "SI_PREFIX_MAP", "SiPrefixes", byName);
      pw.println();
      writeMapWithRegex(pw, "SI_SYM_PREFIX_MAP", "SiSymPrefixes", bySymbol);
      failIfWriteError(pw, outfile);
    }
  }

  /**
   * Generates the stage 0 unit rules: the unit maps (SI by name, SI by symbol, all by symbol)
   * followed by one tagging rule per line of the text-to-unit mapping file.
   *
   * <p>Each non-blank line of {@code infile} must hold at least two tab-separated fields:
   * the pattern and the unit expression. Blank lines are skipped.
   *
   * @param unitsFiles units file(s) to load
   * @param infile tab-separated text-to-unit mapping file
   * @param outfile file to write the rules to
   * @throws IOException if an input cannot be read or the output cannot be written
   * @throws IllegalArgumentException if a non-blank mapping line has fewer than two fields
   */
  private static void generateUnitsStage0Rules(String unitsFiles, String infile, String outfile) throws IOException {
    // Load and format everything before touching the output file, so that a bad
    // units file does not leave a truncated rules file behind.
    List<Unit> units = Units.loadUnits(unitsFiles);
    List<String> siByName = new ArrayList<>();
    List<String> siBySymbol = new ArrayList<>();
    List<String> allBySymbol = new ArrayList<>();
    for (Unit unit : units) {
      String constant = (unit.getType() + "_" + unit.getName()).toUpperCase();
      String symbolEntry = "\"" + unit.symbol + "\": " + constant;
      if ("SI".equals(unit.prefixSystem)) {
        siByName.add("\"" + unit.name + "\": " + constant);
        siBySymbol.add(symbolEntry);
      }
      allBySymbol.add(symbolEntry);
    }

    // The reader is opened first so a missing mapping file fails before the output is created;
    // both resources are closed on every exit path.
    try (BufferedReader br = IOUtils.getBufferedFileReader(infile);
         PrintWriter pw = IOUtils.getPrintWriter(outfile)) {
      writeMapWithRegex(pw, "SI_UNIT_MAP", "SiUnits", siByName);
      pw.println();
      writeMapWithRegex(pw, "SI_SYM_UNIT_MAP", "SiSymUnits", siBySymbol);
      pw.println();
      writeMapWithRegex(pw, "SYM_UNIT_MAP", "SymUnits", allBySymbol);
      pw.println();

      pw.println("ENV.defaults[\"stage\"] = 0");
      int lineNumber = 0;
      String line;
      while ((line = br.readLine()) != null) {
        lineNumber++;
        if (line.trim().isEmpty()) {
          continue;
        }
        String[] fields = TAB_PATTERN.split(line);
        if (fields.length < 2) {
          throw new IllegalArgumentException("Expected at least 2 tab-separated fields at "
              + infile + ":" + lineNumber + " but got: " + line);
        }
        pw.println(String.format("{ pattern: ( %s ), action: Tag($0, \"Unit\", %s) }", fields[0], fields[1]));
      }
      failIfWriteError(pw, outfile);
    }
  }

  /**
   * Generates the rules files (prefix definitions and stage 0 unit rules) using the
   * file names taken from the {@code "qe"} options built from the command line arguments.
   *
   * @param args command line arguments, converted to properties
   * @throws Exception if generation fails
   */
  public static void main(String[] args) throws Exception {
    // Generate rules files
    Properties props = StringUtils.argsToProperties(args);
    Options options = new Options("qe", props);
    generatePrefixDefs(options.prefixFilename, options.prefixRulesFilename);
    generateUnitsStage0Rules(options.unitsFilename, options.text2UnitMapping, options.unitsRulesFilename);
  }
}