LingData.java example

Explorer

web-entity-extractor-ACL2014-master
- src
  - edu
    - stanford
      - nlp
        semparse
        open
        Main.java
        core
        AllOptions.java
        InteractiveDemo.java
        OpenSemanticParser.java
        ParallelizedTrainer.java
        eval
        CandidateStatistics.java
        EvaluationCase.java
        EvaluationNormalFail.java
        EvaluationSuccess.java
        EvaluationSuperFail.java
        Evaluator.java
        EvaluatorStatistics.java
        IterativeTester.java
        dataset
        Criteria.java
        CriteriaExactMatch.java
        CriteriaGeneralWeb.java
        Dataset.java
        Example.java
        ExampleCached.java
        ExpectedAnswer.java
        ExpectedAnswerCriteriaMatch.java
        ExpectedAnswerInjectiveMatch.java
        IRScore.java
        entity
        TargetEntity.java
        TargetEntityNearMatch.java
        TargetEntityPersonName.java
        TargetEntityString.java
        TargetEntitySubstring.java
        library
        DatasetLibrary.java
        JSONDataset.java
        JSONDatasetReader.java
        UnaryDatasets.java
        ling
        AveragedWordVector.java
        BrownClusterTable.java
        ClusterRepnUtils.java
        CreateTypeEntityFeatures.java
        FrequencyTable.java
        LingData.java
        LingTester.java
        LingUtils.java
        QueryTypeTable.java
        WordNetClusterTable.java
        WordVectorTable.java
        model
        AdvancedWordVectorGradient.java
        AdvancedWordVectorParams.java
        AdvancedWordVectorParamsFullRank.java
        AdvancedWordVectorParamsLowRank.java
        FeatureCountPruner.java
        FeatureDomainPruner.java
        FeatureMatcher.java
        FeatureVector.java
        Learner.java
        LearnerBaseline.java
        LearnerMaxEnt.java
        LearnerMaxEntWithBeamSearch.java
        Params.java
        candidate
        Candidate.java
        CandidateGenerator.java
        CandidateGroup.java
        PathEntry.java
        PathEntryWithRange.java
        PathUtils.java
        TreePattern.java
        TreePatternAndRange.java
        feature
        FeatureExtractor.java
        FeaturePostProcessor.java
        FeaturePostProcessorConjoin.java
        FeatureType.java
        FeatureTypeCutRange.java
        FeatureTypeHoleBased.java
        FeatureTypeLinguisticsBased.java
        FeatureTypeNaiveEntityBased.java
        FeatureTypeNodeBased.java
        FeatureTypePathBased.java
        tree
        HTMLFixer.java
        KNode.java
        KNodeUtils.java
        KnowledgeTreeBuilder.java
        util
        BipartiteMatcher.java
        EditDistance.java
        Multiset.java
        Parallelizer.java
        SHA.java
        SearchResult.java
        StringDoubleArrayList.java
        StringDoublePair.java
        StringSampler.java
        VectorAverager.java
        WebUtils.java

package edu.stanford.nlp.semparse.open.ling;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations.LemmaAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.NormalizedNamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.PartOfSpeechAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.semparse.open.Main;
import fig.basic.LogInfo;
import fig.basic.Option;

/**
 * Linguistic annotations for a single utterance: tokens, lemmas, POS tags,
 * coarse POS types, NER tags and normalized NER values.
 *
 * <p>Annotations come either from a Stanford CoreNLP pipeline or, when
 * {@code opts.useAnnotators} is false, from a crude whitespace tokenizer.
 * Instances are JSON-serializable through Jackson and are memoized per input
 * string in a static cache that can be saved to / loaded from a gzipped JSON file.
 *
 * @author akchou
 */
@JsonIgnoreProperties(ignoreUnknown=true)
@JsonInclude(JsonInclude.Include.NON_NULL)
public class LingData {
  /** Command-line configurable options. */
  public static class Options {
    @Option(gloss = "What CoreNLP annotators to run")
    public List<String> annotators = Arrays.asList("tokenize", "ssplit", "pos", "lemma", "ner");

    @Option(gloss = "Whether to use CoreNLP annotators")
    public boolean useAnnotators = true;

    @Option(gloss = "Whether to be case sensitive")
    public boolean caseSensitive = true;

    @Option(gloss = "Linguistic cache filename")
    public String lingCacheFilename = null;

    @Option(gloss = "Frequency of saving linguistic cache periodically")
    public int saveLingCacheFrequency = 50000;
  }
  public static Options opts = new Options();

  /** Shared CoreNLP pipeline; created lazily by {@link #initCoreNLPModels()}. */
  public static StanfordCoreNLP pipeline = null;

  // Update this when changing LingData's structure.
  // The value is embedded in the default cache file name (see getCachePath).
  private static final String VERSION = "4";

  private static final Set<String> AUX_VERBS = new HashSet<>(Arrays.asList(
      "is", "are", "was", "were", "am", "be", "been", "will",
      "shall", "have", "has", "had", "would", "could", "should",
      "do", "does", "did", "can", "may", "might", "must", "seem"));

  public static final Set<String> OPEN_CLASS_POS_TAGS = new HashSet<>(Arrays.asList(
      "CD", "FW", "JJ", "JJR", "JJS", "NN", "NNP", "NNPS", "NNS", "RB",
      "RBR", "RBS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"));

  /**
   * OPEN = nouns, general verbs, adjectives, adverbs, numbers
   * AUX = auxiliary verbs (which is a special case of CLOSE)
   * CLOSE = other POS
   */
  public enum POSType { OPEN, AUX, CLOSE }

  // Tokenization of input.
  @JsonProperty
  public final List<String> tokens;
  @JsonProperty
  public final List<String> lemmaTokens;  // Lemmatized version

  // Syntactic information from JavaNLP.
  @JsonProperty
  public final List<String> posTags;  // POS tags
  @JsonProperty
  public final List<POSType> posTypes;  // type of POS tag
  @JsonProperty
  public final List<String> nerTags;  // NER tags
  @JsonProperty
  public final List<String> nerValues;  // NER values (contains times, dates, etc.); entries may be null
  @JsonIgnore
  public final int length;  // number of tokens

  /**
   * Annotates {@code utterance}.
   *
   * <p>With {@code opts.useAnnotators} the models are initialized on demand and
   * the utterance is run through the CoreNLP pipeline; otherwise the utterance
   * is split on whitespace and each token is tagged as a number ("CD"/"NUMBER")
   * when {@link Double#parseDouble(String)} accepts it and as "UNK" otherwise.
   *
   * @param utterance the raw text to annotate
   */
  public LingData(String utterance) {
    utterance = breakHyphenatedWords(utterance);

    tokens = new ArrayList<>();
    posTags = new ArrayList<>();
    posTypes = new ArrayList<>();
    nerTags = new ArrayList<>();
    nerValues = new ArrayList<>();
    lemmaTokens = new ArrayList<>();

    if (opts.useAnnotators) {
      initModels();
      Annotation annotation = pipeline.process(utterance);

      for (CoreLabel token : annotation.get(CoreAnnotations.TokensAnnotation.class)) {
        String word = token.get(TextAnnotation.class);
        // Locale.ROOT keeps the lowercasing independent of the JVM's default locale.
        tokens.add(opts.caseSensitive ? word : word.toLowerCase(Locale.ROOT));

        // NOTE(review): the POS and NER tags are dereferenced unconditionally, so the
        // configured annotators presumably must include "pos" and "ner" -- confirm.
        String pos = token.get(PartOfSpeechAnnotation.class).intern();
        posTags.add(pos);
        posTypes.add(getPOSType(pos, word));

        nerTags.add(token.get(NamedEntityTagAnnotation.class).intern());
        lemmaTokens.add(token.get(LemmaAnnotation.class));
        // Stored as returned by CoreNLP; may be null (see getNormalizedNerSpan).
        nerValues.add(token.get(NormalizedNamedEntityTagAnnotation.class));
      }

    } else {
      // Create tokens crudely
      for (String token : utterance.trim().split("\\s+")) {
        tokens.add(token);
        lemmaTokens.add(token);
        try {
          Double.parseDouble(token);
          posTags.add("CD");
          posTypes.add(POSType.OPEN);
          nerTags.add("NUMBER");
          nerValues.add(token);
        } catch (NumberFormatException e) {
          // Not a number: everything else gets the placeholder tag.
          posTags.add("UNK");
          posTypes.add(POSType.OPEN);
          nerTags.add("UNK");
          nerValues.add("UNK");
        }
      }
    }

    this.length = tokens.size();
  }

  /**
   * Replaces every hyphen that is immediately followed by a letter with a space.
   *
   * Stanford tokenizer doesn't break hyphens, so this splits utterances like
   * "Spanish-speaking countries" but leaves "2012-03-28" untouched.
   */
  private static String breakHyphenatedWords(String utterance) {
    StringBuilder buf = new StringBuilder(utterance);
    for (int i = 0; i + 1 < buf.length(); i++) {
      if (buf.charAt(i) == '-' && Character.isLetter(buf.charAt(i + 1)))
        buf.setCharAt(i, ' ');
    }
    return buf.toString();
  }

  /**
   * Maps a POS tag to its coarse type. A known auxiliary verb whose tag starts
   * with 'V' is AUX; a tag in {@link #OPEN_CLASS_POS_TAGS} is OPEN; the rest is CLOSE.
   */
  private static POSType getPOSType(String pos, String word) {
    // startsWith (rather than charAt(0)) also tolerates an empty tag.
    if (pos.startsWith("V") && AUX_VERBS.contains(word.toLowerCase(Locale.ROOT)))
      return POSType.AUX;
    if (OPEN_CLASS_POS_TAGS.contains(pos))
      return POSType.OPEN;
    return POSType.CLOSE;
  }

  /**
   * Constructor used by Jackson when reading a cached entry. The lists are
   * stored as given (no copies); {@code length} is taken from {@code tokens}.
   */
  @JsonCreator
  public LingData(@JsonProperty("tokens") List<String> tokens,
                  @JsonProperty("lemmaTokens") List<String> lemmaTokens,
                  @JsonProperty("posTags") List<String> posTags,
                  @JsonProperty("posTypes") List<POSType> posTypes,
                  @JsonProperty("nerTags") List<String> nerTags,
                  @JsonProperty("nerValues") List<String> nerValues) {
    this.tokens = tokens;
    this.lemmaTokens = lemmaTokens;
    this.posTags = posTags;
    this.posTypes = posTypes;
    this.nerTags = nerTags;
    this.nerValues = nerValues;
    this.length = tokens.size();
  }

  /** Initializes the CoreNLP pipeline and all auxiliary linguistic tables. */
  public static void initModels() {
    initCoreNLPModels();
    BrownClusterTable.initModels();
    WordVectorTable.initModels();
    FrequencyTable.initModels();
    WordNetClusterTable.initModels();
    QueryTypeTable.initModels();
  }

  /**
   * Creates the shared CoreNLP pipeline if it does not exist yet. Cased or
   * caseless POS/NER models are selected according to {@code opts.caseSensitive}.
   * This method is not synchronized.
   */
  public static void initCoreNLPModels() {
    if (pipeline != null) return;
    LogInfo.begin_track("Initializing Core NLP Models ...");
    Properties props = new Properties();
    props.put("annotators", String.join(",", opts.annotators));
    if (opts.caseSensitive) {
      props.put("pos.model", "edu/stanford/nlp/models/pos-tagger/english-bidirectional/english-bidirectional-distsim.tagger");
      props.put("ner.model", "edu/stanford/nlp/models/ner/english.all.3class.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.distsim.crf.ser.gz");
    } else {
      props.put("pos.model", "edu/stanford/nlp/models/pos-tagger/english-caseless-left3words-distsim.tagger");
      props.put("ner.model", "edu/stanford/nlp/models/ner/english.all.3class.caseless.distsim.crf.ser.gz,edu/stanford/nlp/models/ner/english.conll.4class.caseless.distsim.crf.ser.gz");
    }
    pipeline = new StanfordCoreNLP(props);
    LogInfo.end_track();
  }

  /**
   * Two instances are equal when their tokens, lemmas, POS tags and NER tags
   * are equal. POS types and NER values are not compared.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;

    LingData that = (LingData) o;
    return lemmaTokens.equals(that.lemmaTokens)
        && nerTags.equals(that.nerTags)
        && posTags.equals(that.posTags)
        && tokens.equals(that.tokens);
  }

  /** Hash over exactly the fields compared by {@link #equals(Object)}. */
  @Override
  public int hashCode() {
    return Objects.hash(lemmaTokens, nerTags, posTags, tokens);
  }

  // Return a string representing the tokens between start (inclusive) and end (exclusive).
  public String phrase(int start, int end) {
    return sliceSequence(tokens, start, end);
  }
  // Same as phrase, but over the lemmatized tokens.
  public String lemmaPhrase(int start, int end) {
    return sliceSequence(lemmaTokens, start, end);
  }

  /**
   * Joins {@code items[start, end)} with single spaces.
   *
   * @throws RuntimeException if the range is empty or reversed
   */
  private static String sliceSequence(List<String> items, int start, int end) {
    if (start >= end) throw new RuntimeException("Bad indices");
    if (end - start == 1) return items.get(start);
    return String.join(" ", items.subList(start, end));
  }

  // If all the tokens in [start, end) have the same nerValues, but not
  // start-1 and end (in other words, [start, end) is maximal), then return
  // the normalizedTag.  Example: queryTag = "DATE".
  // Returns null when the token at start has no normalized value, when its
  // NER tag differs from queryTag, or when the span is not uniform/maximal.
  public String getNormalizedNerSpan(String queryTag, int start, int end) {
    String value = nerValues.get(start);
    // nerValues may hold nulls; without this guard value.equals(...) below would throw.
    if (value == null) return null;
    if (!queryTag.equals(nerTags.get(start))) return null;
    if (start-1 >= 0 && value.equals(nerValues.get(start-1))) return null;
    if (end < nerValues.size() && value.equals(nerValues.get(end))) return null;
    for (int i = start+1; i < end; i++)
      if (!value.equals(nerValues.get(i))) return null;
    return value;
  }

  /** Returns the collapsed POS tag ("N", "V", "W" or the tag itself) of token {@code index}. */
  public String getCanonicalPos(int index) {
    return getCanonicalPos(posTags.get(index));
  }

  // Collapses every tag starting with N, V or W to that single letter.
  private static String getCanonicalPos(String pos) {
    if (pos.startsWith("N")) return "N";
    if (pos.startsWith("V")) return "V";
    if (pos.startsWith("W")) return "W";
    return pos;
  }

  // ============================================================
  // Caching
  // ============================================================

  protected static Map<String, LingData> cache = new ConcurrentHashMap<>();

  /**
   * Returns the annotations for {@code string}, computing and caching them on a
   * miss. When a new entry brings the cache size to a multiple of
   * {@code opts.saveLingCacheFrequency} (if positive), the cache is saved.
   */
  public static LingData get(String string) {
    LingData data = cache.get(string);
    if (data != null) return data;

    data = new LingData(string);
    // Another thread may have inserted the same key while we were annotating;
    // keep its entry so every caller sees one canonical instance.
    LingData existing = cache.putIfAbsent(string, data);
    if (existing != null) return existing;

    int size = cache.size();
    if (opts.saveLingCacheFrequency > 0 && size % opts.saveLingCacheFrequency == 0) {
      LogInfo.logs("Linguistic Cache size: %d", size);
      saveCache();
    }
    return data;
  }

  /**
   * Like {@link #get(String)} but never modifies the cache: a cached entry is
   * reused when present, otherwise a fresh instance is created and not stored.
   */
  public static LingData getNoCache(String string) {
    LingData data = cache.get(string);
    if (data == null) {
      data = new LingData(string);
    }
    return data;
  }

  /**
   * Returns the cache file path ({@code opts.lingCacheFilename}, or a default
   * under "cache/" derived from the dataset name and {@code version}) and
   * creates its parent directory when the path has one.
   */
  private static String getCachePath(String version) {
    String path = opts.lingCacheFilename;
    if (path == null) {
      path = "cache/ling-" + Main.opts.dataset + ".v" + version + ".json.gz";
    }
    // getParentFile() is null for a bare file name such as "ling.json.gz".
    File parent = new File(path).getParentFile();
    if (parent != null) parent.mkdirs();
    return path;
  }

  // Cache size at the time of the last successful save or load.
  private static int lastSavedCacheSize = 0;

  /**
   * Writes the whole cache as gzipped JSON unless its size is unchanged since
   * the last successful save or load. Failures are logged, not thrown.
   */
  public synchronized static void saveCache() {
    LogInfo.begin_track("Saving linguistic data to cache ...");
    if (cache.size() == lastSavedCacheSize) {
      LogInfo.log("Cache unchanged.");    // Do nothing
    } else {
      String cachePath = getCachePath(VERSION);
      try (FileOutputStream out = new FileOutputStream(cachePath);
          GZIPOutputStream so = new GZIPOutputStream(out)) {
        out.getChannel().lock();
        ObjectMapper mapper = new ObjectMapper();
        // Snapshot the size first: entries added while writing must trigger a later save.
        int sizeBeforeWrite = cache.size();
        mapper.writeValue(so, cache);
        // Only record the size once the write succeeded, so a failed save is retried.
        lastSavedCacheSize = sizeBeforeWrite;
        LogInfo.logs("Written cache to %s", cachePath);
      } catch (Exception e) {
        LogInfo.warnings("Cache cannot be saved to %s!", cachePath);
        LogInfo.warning(e);
        e.printStackTrace();
      }
    }
    LogInfo.end_track();
  }

  /**
   * Replaces the in-memory cache with the contents of the cache file.
   * A missing or unreadable file is logged and leaves the current cache in place.
   */
  public static void loadCache() {
    LogInfo.begin_track("Loading linguistic data from cache ...");
    String cachePath = getCachePath(VERSION);
    try (GZIPInputStream si = new GZIPInputStream(new FileInputStream(cachePath))) {
      ObjectMapper mapper = new ObjectMapper();
      cache = mapper.readValue(si, new TypeReference<ConcurrentHashMap<String, LingData>>(){});
      lastSavedCacheSize = cache.size();
      LogInfo.logs("Cache loaded from %s", cachePath);
    } catch (FileNotFoundException e) {
      LogInfo.warnings("Cache cannot be loaded: File %s does not exist", cachePath);
    } catch (Exception e) {
      LogInfo.warnings("Cache cannot be loaded: Cache corruped!");
      LogInfo.warning(e);
    }
    LogInfo.end_track();
  }

  // ============================================================
  // Getters
  // ============================================================

  /**
   * @param lemmatized
   *     whether to use lemmatized tokens
   * @param onlyOpenPOS
   *     whether to return only tokens whose POS type is OPEN
   *     (noun, verb, adjective, adverb, number; auxiliary verbs are excluded)
   * @return A new set of the selected tokens
   */
  public Set<String> getTokens(boolean lemmatized, boolean onlyOpenPOS) {
    List<String> source = lemmatized ? lemmaTokens : tokens;
    Set<String> answer = new HashSet<>();
    for (int i = 0; i < length; i++) {
      if (!onlyOpenPOS || posTypes.get(i) == POSType.OPEN) {
        answer.add(source.get(i));
      }
    }
    return answer;
  }
}