package edu.stanford.nlp.pipeline;
import edu.stanford.nlp.ie.KBPRelationExtractor;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.Timex;
import edu.stanford.nlp.util.ArgumentParser;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.SystemUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;
/**
* An annotator for entity linking to Wikipedia pages via the Wikidict.
*
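* <p>A minimal usage sketch, assuming the annotator is registered under the name
* {@code entitylink} (as in {@link #main(String[])} below); the dictionary path is a
* placeholder, not a bundled file:
* <pre>{@code
* Properties props = new Properties();
* props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions,entitylink");
* props.setProperty("entitylink.wikidict", "/path/to/wikidict.tsv");  // placeholder path
* StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
* Annotation ann = new Annotation("Barack Obama was born in Hawaii.");
* pipeline.annotate(ann);
* for (CoreLabel token : ann.get(CoreAnnotations.TokensAnnotation.class)) {
*   System.out.println(token.word() + "\t" + token.get(CoreAnnotations.WikipediaEntityAnnotation.class));
* }
* }</pre>
*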
* @author Gabor Angeli
*/
@SuppressWarnings("FieldCanBeLocal")
public class WikidictAnnotator extends SentenceAnnotator {
/** A logger for this class */
private static final Redwood.RedwoodChannels log = Redwood.channels(WikidictAnnotator.class);
/** A pattern for simple numbers */
private static final Pattern NUMBER_PATTERN = Pattern.compile("[0-9\\.]+");
@ArgumentParser.Option(name="threads", gloss="The number of threads to run this annotator on")
private int threads = 1;
@ArgumentParser.Option(name="wikidict", gloss="The location of the <text, link, score> TSV file")
private String wikidictPath = DefaultPaths.DEFAULT_WIKIDICT_TSV;
@ArgumentParser.Option(name="threshold", gloss="The score threshold under which to discard links")
private double threshold = 0.0;
/**
* The actual Wikidict dictionary.
*/
private final Map<String, String> dictionary = new HashMap<>(21000000); // it's gonna be large no matter what
/**
* Create a new WikiDict annotator, with the given name and properties.
*/
public WikidictAnnotator(String name, Properties properties) {
ArgumentParser.fillOptions(this, name, properties);
long startTime = System.currentTimeMillis();
log.info("Reading Wikidict from " + wikidictPath);
try {
int i = 0;
String[] fields = new String[3];
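// Each dictionary line is expected to be: surfaceForm <TAB> link <TAB> score
// (illustrative example: "Obama\tBarack_Obama\t0.9")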
for (String line : IOUtils.readLines(wikidictPath, "UTF-8")) {
// Skip malformed lines: empty lines, and lines whose surface form is empty
if (line.isEmpty() || line.charAt(0) == '\t') {
continue;
}
StringUtils.splitOnChar(fields, line, '\t');
// Check that the read entry is above the score threshold
if (threshold > 0.0) {
double score = Double.parseDouble(fields[2]);
if (score < threshold) {
continue;
}
}
String surfaceForm = fields[0];
String link = fields[1].intern(); // intern, as most entities have multiple surface forms
// Add the entry, then log progress periodically (counting only entries actually added)
dictionary.put(surfaceForm, link);
i += 1;
if (i % 1000000 == 0) {
log.info("Loaded " + i + " entries from Wikidict [" + SystemUtils.getMemoryInUse() + "MB memory used; " + Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + " elapsed]");
}
}
log.info("Done reading Wikidict (" + dictionary.size() + " links read; " + Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + " elapsed)");
} catch (Exception e) {
throw new RuntimeException("Could not load Wikidict from " + wikidictPath, e);
}
}
/** @see WikidictAnnotator#WikidictAnnotator(String, Properties) */
@SuppressWarnings("unused")
public WikidictAnnotator(Properties properties) {
this(STANFORD_LINK, properties);
}
/**
* Try to normalize timex values to the form they would appear in the knowledge base.
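*
* <p>For example, {@code normalizeTimex("2017-02-15T14:00")} returns {@code "2017-02-15"},
* while a bare date like {@code "2017-02-15"} is returned unchanged.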
* @param timex The timex value to normalize.
* @return The normalized timex value (e.g., dates have the time of day removed, etc.)
*/
public static String normalizeTimex(String timex) {
// Strip the time of day from date-time values (e.g., "2017-02-15T14:00" becomes "2017-02-15").
// Abstract values like PRESENT, PAST, and FUTURE (and their *_REF variants) also contain the
// letter 'T', so they are explicitly left untouched rather than truncated.
if (timex.contains("T") && !timex.startsWith("PRESENT") && !timex.startsWith("PAST") && !timex.startsWith("FUTURE")) {
return timex.substring(0, timex.indexOf("T"));
} else {
return timex;
}
}
/**
* Link the given mention, if possible.
*
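* <p>Resolution proceeds in order: dates and times are normalized via
* {@link #normalizeTimex(String)}, ordinals are mapped to their numeric value, bare numbers
* are kept as-is, and all other named entities are looked up in the Wikidict.
*
* <p>A rough usage sketch ({@code annotator} and {@code mention} are assumed to come from a
* configured pipeline):
* <pre>{@code
* Optional<String> page = annotator.link(mention);
* page.ifPresent(p -> System.out.println("Linked to " + p));
* }</pre>
*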
* @param mention The mention to link, as given by {@link EntityMentionsAnnotator}
*
* @return The Wikidict entry for the given mention, or the normalized timex / numeric value -- as appropriate.
*/
public Optional<String> link(CoreMap mention) {
String surfaceForm = mention.get(CoreAnnotations.OriginalTextAnnotation.class) == null
? mention.get(CoreAnnotations.TextAnnotation.class)
: mention.get(CoreAnnotations.OriginalTextAnnotation.class);
String ner = mention.get(CoreAnnotations.NamedEntityTagAnnotation.class);
if (ner != null &&
(KBPRelationExtractor.NERTag.DATE.name.equalsIgnoreCase(ner) ||
"TIME".equalsIgnoreCase(ner) ||
"SET".equalsIgnoreCase(ner)) &&
mention.get(TimeAnnotations.TimexAnnotation.class) != null &&
mention.get(TimeAnnotations.TimexAnnotation.class).value() != null) {
// Case: normalize dates
Timex timex = mention.get(TimeAnnotations.TimexAnnotation.class);
// Skip abstract temporal references (PRESENT, PAST, FUTURE and their *_REF variants),
// which carry no concrete date to link; value() is non-null here thanks to the check above
String value = timex.value();
if (!value.equals("PRESENT") && !value.equals("PRESENT_REF") &&
!value.equals("PAST") && !value.equals("PAST_REF") &&
!value.equals("FUTURE") && !value.equals("FUTURE_REF")) {
return Optional.of(normalizeTimex(value));
} else {
return Optional.empty();
}
} else if (ner != null &&
"ORDINAL".equalsIgnoreCase(ner) &&
mention.get(CoreAnnotations.NumericValueAnnotation.class) != null) {
// Case: normalize ordinals
Number numericValue = mention.get(CoreAnnotations.NumericValueAnnotation.class);
return Optional.of(numericValue.toString());
} else if (NUMBER_PATTERN.matcher(surfaceForm).matches()) {
// Case: keep numbers as is
return Optional.of(surfaceForm);
} else if (ner != null && !"O".equals(ner) && dictionary.containsKey(surfaceForm)) {
// Case: link with Wikidict
return Optional.of(dictionary.get(surfaceForm));
} else {
// Else: no link was found, so leave the mention unlinked
return Optional.empty();
}
}
/** {@inheritDoc} */
@Override
protected int nThreads() {
return threads;
}
/** {@inheritDoc} */
@Override
protected long maxTime() {
return -1L;
}
/** {@inheritDoc} */
@Override
protected void doOneSentence(Annotation annotation, CoreMap sentence) {
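// Default every token to the null link "O"; tokens inside linked mentions are overwritten below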
for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
token.set(CoreAnnotations.WikipediaEntityAnnotation.class, "O");
}
for (CoreMap mention : sentence.get(CoreAnnotations.MentionsAnnotation.class)) {
Optional<String> canonicalName = link(mention);
if (canonicalName.isPresent()) {
mention.set(CoreAnnotations.WikipediaEntityAnnotation.class, canonicalName.get());
for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
token.set(CoreAnnotations.WikipediaEntityAnnotation.class, canonicalName.get());
}
}
}
}
/** {@inheritDoc} */
@Override
protected void doOneFailedSentence(Annotation annotation, CoreMap sentence) {
/* do nothing */
}
/** {@inheritDoc} */
@Override
public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
return Collections.singleton(CoreAnnotations.WikipediaEntityAnnotation.class);
}
/** {@inheritDoc} */
@Override
public Set<Class<? extends CoreAnnotation>> requires() {
Set<Class<? extends CoreAnnotation>> requirements = new HashSet<>(Arrays.asList(
CoreAnnotations.TextAnnotation.class,
CoreAnnotations.TokensAnnotation.class,
CoreAnnotations.SentencesAnnotation.class,
CoreAnnotations.OriginalTextAnnotation.class,
CoreAnnotations.MentionsAnnotation.class
));
return Collections.unmodifiableSet(requirements);
}
/**
* A debugging method to try entity linking sentences from the console.
*
* @throws IOException if reading from the console fails
*/
public static void main(String[] args) throws IOException {
Properties props = StringUtils.argsToProperties(args);
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions,entitylink");
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
IOUtils.console("sentence> ", line -> {
Annotation ann = new Annotation(line);
pipeline.annotate(ann);
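// For simplicity, this debugging loop only prints links for the first sentence of the input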
List<CoreLabel> tokens = ann.get(CoreAnnotations.SentencesAnnotation.class).get(0).get(CoreAnnotations.TokensAnnotation.class);
System.err.println(StringUtils.join(tokens.stream().map(x -> x.get(CoreAnnotations.WikipediaEntityAnnotation.class)), " "));
});
}
}