package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ie.KBPRelationExtractor;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.time.TimeAnnotations;
import edu.stanford.nlp.time.Timex;
import edu.stanford.nlp.util.ArgumentParser;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.SystemUtils;
import edu.stanford.nlp.util.logging.Redwood;

import java.io.IOException;
import java.util.*;
import java.util.regex.Pattern;

/**
 * An annotator for entity linking to Wikipedia pages via the Wikidict.
 *
 * @author Gabor Angeli
 */
@SuppressWarnings("FieldCanBeLocal")
public class WikidictAnnotator extends SentenceAnnotator {

  /** A logger for this class */
  private static final Redwood.RedwoodChannels log = Redwood.channels(WikidictAnnotator.class);

  /** A pattern for simple numbers */
  private static final Pattern NUMBER_PATTERN = Pattern.compile("[0-9\\.]+");

  @ArgumentParser.Option(name="threads", gloss="The number of threads to run this annotator on")
  private int threads = 1;

  @ArgumentParser.Option(name="wikidict", gloss="The location of the <text, link, score> TSV file")
  private String wikidictPath = DefaultPaths.DEFAULT_WIKIDICT_TSV;

  @ArgumentParser.Option(name="threshold", gloss="The score threshold under which to discard links")
  private double threshold = 0.0;

  /**
   * The actual Wikidict dictionary.
   */
  private final Map<String, String> dictionary = new HashMap<>(21000000);  // it's gonna be large no matter what

  /**
   * Create a new Wikidict annotator, with the given name and properties.
   */
  public WikidictAnnotator(String name, Properties properties) {
    ArgumentParser.fillOptions(this, name, properties);
    long startTime = System.currentTimeMillis();
    log.info("Reading Wikidict from " + wikidictPath);
    try {
      int i = 0;
      String[] fields = new String[3];
      for (String line : IOUtils.readLines(wikidictPath, "UTF-8")) {
        if (line.charAt(0) == '\t') {
          continue;
        }
        StringUtils.splitOnChar(fields, line, '\t');
        if (i % 1000000 == 0) {
          log.info("Loaded " + i + " entries from Wikidict [" + SystemUtils.getMemoryInUse() + "MB memory used; " +
              Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + " elapsed]");
        }
        // Check that the read entry is above the score threshold
        if (threshold > 0.0) {
          double score = Double.parseDouble(fields[2]);
          if (score < threshold) {
            continue;
          }
        }
        String surfaceForm = fields[0];
        String link = fields[1].intern();  // intern, as most entities have multiple surface forms
        // Add the entry
        dictionary.put(surfaceForm, link);
        i += 1;
      }
      log.info("Done reading Wikidict (" + dictionary.size() + " links read; " +
          Redwood.formatTimeDifference(System.currentTimeMillis() - startTime) + " elapsed)");
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  /** @see WikidictAnnotator#WikidictAnnotator(String, Properties) */
  @SuppressWarnings("unused")
  public WikidictAnnotator(Properties properties) {
    this(STANFORD_LINK, properties);
  }

  /**
   * Try to normalize timex values to the form they would appear in the knowledge base.
   *
   * @param timex The timex value to normalize.
   * @return The normalized timex value (e.g., dates have the time of day removed, etc.).
   */
  public static String normalizeTimex(String timex) {
    if (timex.contains("T") && !"PRESENT".equals(timex)) {
      return timex.substring(0, timex.indexOf("T"));
    } else {
      return timex;
    }
  }
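  // Illustrative example of normalizeTimex (hypothetical values, not from the Wikidict data):
  // normalizeTimex("2016-02-28T16:00") yields "2016-02-28", dropping the time of day, while
  // normalizeTimex("2016-02-28") and normalizeTimex("PRESENT") return their input unchanged.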
  /**
   * Link the given mention, if possible.
   *
   * @param mention The mention to link, as given by {@link EntityMentionsAnnotator}.
   *
   * @return The Wikidict entry for the given mention, or the normalized timex / numeric value, as appropriate.
   */
  public Optional<String> link(CoreMap mention) {
    String surfaceForm = mention.get(CoreAnnotations.OriginalTextAnnotation.class) == null
        ? mention.get(CoreAnnotations.TextAnnotation.class)
        : mention.get(CoreAnnotations.OriginalTextAnnotation.class);
    String ner = mention.get(CoreAnnotations.NamedEntityTagAnnotation.class);

    if (ner != null &&
        (KBPRelationExtractor.NERTag.DATE.name.equalsIgnoreCase(ner) ||
            "TIME".equalsIgnoreCase(ner) ||
            "SET".equalsIgnoreCase(ner)) &&
        mention.get(TimeAnnotations.TimexAnnotation.class) != null &&
        mention.get(TimeAnnotations.TimexAnnotation.class).value() != null) {
      // Case: normalize dates
      Timex timex = mention.get(TimeAnnotations.TimexAnnotation.class);
      if (timex.value() != null &&
          !timex.value().equals("PRESENT") &&
          !timex.value().equals("PRESENT_REF") &&
          !timex.value().equals("PAST") &&
          !timex.value().equals("PAST_REF") &&
          !timex.value().equals("FUTURE") &&
          !timex.value().equals("FUTURE_REF")) {
        return Optional.of(normalizeTimex(timex.value()));
      } else {
        return Optional.empty();
      }
    } else if (ner != null &&
        "ORDINAL".equalsIgnoreCase(ner) &&
        mention.get(CoreAnnotations.NumericValueAnnotation.class) != null) {
      // Case: normalize ordinals
      Number numericValue = mention.get(CoreAnnotations.NumericValueAnnotation.class);
      return Optional.of(numericValue.toString());
    } else if (NUMBER_PATTERN.matcher(surfaceForm).matches()) {
      // Case: keep numbers as is
      return Optional.of(surfaceForm);
    } else if (ner != null &&
        !"O".equals(ner) &&
        dictionary.containsKey(surfaceForm)) {
      // Case: link with Wikidict
      return Optional.of(dictionary.get(surfaceForm));
    } else {
      // Else: keep the surface form as is
      return Optional.empty();
    }
  }

  /** {@inheritDoc} */
  @Override
  protected int nThreads() {
    return threads;
  }

  /** {@inheritDoc} */
  @Override
  protected long maxTime() {
    return -1L;
  }

  /** {@inheritDoc} */
  @Override
  protected void doOneSentence(Annotation annotation, CoreMap sentence) {
    for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
      token.set(CoreAnnotations.WikipediaEntityAnnotation.class, "O");
    }
    for (CoreMap mention : sentence.get(CoreAnnotations.MentionsAnnotation.class)) {
      Optional<String> canonicalName = link(mention);
      if (canonicalName.isPresent()) {
        mention.set(CoreAnnotations.WikipediaEntityAnnotation.class, canonicalName.get());
        for (CoreLabel token : mention.get(CoreAnnotations.TokensAnnotation.class)) {
          token.set(CoreAnnotations.WikipediaEntityAnnotation.class, canonicalName.get());
        }
      }
    }
  }

  /** {@inheritDoc} */
  @Override
  protected void doOneFailedSentence(Annotation annotation, CoreMap sentence) {
    /* do nothing */
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
    return Collections.singleton(CoreAnnotations.WikipediaEntityAnnotation.class);
  }

  /** {@inheritDoc} */
  @Override
  public Set<Class<? extends CoreAnnotation>> requires() {
    Set<Class<? extends CoreAnnotation>> requirements = new HashSet<>(Arrays.asList(
        CoreAnnotations.TextAnnotation.class,
        CoreAnnotations.TokensAnnotation.class,
        CoreAnnotations.SentencesAnnotation.class,
        CoreAnnotations.OriginalTextAnnotation.class,
        CoreAnnotations.MentionsAnnotation.class
    ));
    return Collections.unmodifiableSet(requirements);
  }

  /**
   * A debugging method to try entity linking sentences from the console.
   *
   * @throws IOException
   */
  public static void main(String[] args) throws IOException {
    Properties props = StringUtils.argsToProperties(args);
    props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions,entitylink");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
    IOUtils.console("sentence> ", line -> {
      Annotation ann = new Annotation(line);
      pipeline.annotate(ann);
      List<CoreLabel> tokens = ann.get(CoreAnnotations.SentencesAnnotation.class).get(0)
          .get(CoreAnnotations.TokensAnnotation.class);
      System.err.println(StringUtils.join(
          tokens.stream().map(x -> x.get(CoreAnnotations.WikipediaEntityAnnotation.class)), " "));
    });
  }

}
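// Usage sketch (illustrative; mirrors the annotator list used in main() above, and assumes the
// default Wikidict model is available on the classpath; the input sentence is hypothetical):
//
//   Properties props = new Properties();
//   props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,entitymentions,entitylink");
//   StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
//   Annotation ann = new Annotation("Stanford University is in California.");
//   pipeline.annotate(ann);
//   for (CoreMap sentence : ann.get(CoreAnnotations.SentencesAnnotation.class)) {
//     for (CoreLabel token : sentence.get(CoreAnnotations.TokensAnnotation.class)) {
//       // Tokens outside any linked mention carry the default value "O" (see doOneSentence above).
//       System.out.println(token.word() + "\t" + token.get(CoreAnnotations.WikipediaEntityAnnotation.class));
//     }
//   }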