DocumentData.java example

Explorer
kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged
package hu.u_szeged.kpe.readers;

import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.utils.NLPUtils;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.regex.Pattern;

import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.MweDictAnnotator.MWEAnnotation;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;

/**
 * Describes a single document: its identifier, the file it comes from, its ordinal position within that
 * file, its etalon keyphrases, its acronyms and its formatted strings.
 * <p>
 * Identity ({@link #equals(Object)}, {@link #hashCode()}, {@link #compareTo(DocumentData)}) is defined by
 * the pair (file, line number in file) only; the document id and the keyphrases do not take part in it.
 */
public class DocumentData implements Comparable<DocumentData>, Serializable {

  private static final long serialVersionUID = -8144005167022088407L;

  /** Document types that are handled as scientific papers (case-insensitive). */
  private static final Pattern SCIENTIFIC_TYPE = Pattern.compile("(?i)semeval|scientific");
  /** Tokens that are allowed between the brackets of a reference, the closing bracket included. */
  private static final Pattern REFERENCE_TOKEN = Pattern.compile("(,|\\d+|[A-Z&a-z]+\\d{2}|-RRB-)");
  /** A single punctuation mark at the very beginning or at the very end of a phrase. */
  private static final Pattern EDGE_PUNCTUATION = Pattern.compile("^\\p{Punct}|\\p{Punct}$");

  /** Word form of an opening bracket token. */
  private static final String LEFT_BRACKET = "-LRB-";
  /** Word form of a closing bracket token. */
  private static final String RIGHT_BRACKET = "-RRB-";

  /** The total number of DocumentData objects initialized (incremented without any synchronization) */
  private static int totalDocuments;
  /** Identifier of the document */
  private int documentId;
  /** Location of the document */
  private String file;
  /** Stores ordinal number of the document within the file */
  private int lineNumInFile;
  /** Keyphrases of the document */
  private Map<NGram, Integer> etalonKeyphrases;
  /** Acronyms of the document */
  private Map<String, Integer> acronyms;
  /**
   * Mapping between the formatted strings of the document and their actual formatting. It is never
   * assigned inside this class, so it stays {@code null} unless something else (presumably a subclass)
   * sets it.
   */
  protected Map<NGram, Set<String>> formattedStrings;
  /** This can be useful to define reader specific behavior */
  private String documentType;

  /**
   * Creates the descriptor of a document and assigns the next free document id to it.
   *
   * @param keyph
   *          etalon keyphrases, one phrase per line; may be {@code null} or empty
   * @param fileName
   *          location of the document
   * @param docType
   *          class whose simple name, with the substring "Reader" removed, becomes the document type
   */
  public DocumentData(String keyph, String fileName, Class<?> docType) {
    documentId = totalDocuments++;
    etalonKeyphrases = transformKeyphrases(keyph);
    file = fileName;
    documentType = docType.getSimpleName().replace("Reader", "");
  }

  /** @return the identifier of the document */
  public int getDocId() {
    return documentId;
  }

  /**
   * @param id
   *          the new identifier of the document
   */
  public void setDocId(int id) {
    documentId = id;
  }

  /** @return the formatted strings of the document, possibly {@code null} */
  public Map<NGram, Set<String>> getFormattedStrings() {
    return formattedStrings;
  }

  /** @return the etalon keyphrases of the document mapped to their number of occurrences */
  public Map<NGram, Integer> getKeyphrases() {
    return etalonKeyphrases;
  }

  /**
   * Replaces the etalon keyphrases with the ones parsed from the given string.
   *
   * @param keyph
   *          etalon keyphrases, one phrase per line
   * @see #transformKeyphrases(String)
   */
  public void setKeyphrases(String keyph) {
    etalonKeyphrases = transformKeyphrases(keyph);
  }

  /** @return the location of the document */
  public String getFile() {
    return file;
  }

  /**
   * @param file
   *          the new location of the document
   */
  public void setFile(String file) {
    this.file = file;
  }

  /** @return the acronyms of the document, {@code null} until {@link #setAcronyms(Map)} is called */
  public Map<String, Integer> getAcronyms() {
    return acronyms;
  }

  /**
   * @param acr
   *          the acronyms of the document
   */
  public void setAcronyms(Map<String, Integer> acr) {
    acronyms = acr;
  }

  /**
   * @param phraseBuffer
   *          the phrase to look up
   * @return true if formatted strings are available and the phrase is among their keys
   */
  public boolean isFormatted(NGram phraseBuffer) {
    return formattedStrings != null && formattedStrings.containsKey(phraseBuffer);
  }

  /**
   * Gets all the phrases in the given string and puts them into a map with its occurrences.
   * <p>
   * The input is split at line breaks and blank lines are ignored. One punctuation mark is stripped from
   * each end of a phrase, except for "c++" (in any casing) and phrases starting with ".net" (in any
   * casing), which are kept intact. Every phrase is then annotated with {@code KpeReader.sentenceAnalyzer}
   * and the {@link NGram} built from its tokens is counted.
   *
   * @param keyphrases
   *          the phrases, one per line; may be {@code null} or empty
   * @return the phrases mapped to their number of occurrences; empty for {@code null} or empty input
   */
  public Map<NGram, Integer> transformKeyphrases(String keyphrases) {
    Map<NGram, Integer> occurrences = new HashMap<NGram, Integer>();
    if (keyphrases == null || keyphrases.length() == 0) {
      return occurrences;
    }
    for (String line : keyphrases.split("(\r?\n)+")) {
      String tok = line.trim();
      if (tok.length() == 0) {
        continue;
      }

      // phrases whose leading/trailing punctuation is part of the name must not be stripped
      boolean keepPunctuation = tok.equalsIgnoreCase("c++") || tok.toLowerCase().startsWith(".net");
      String newTok = keepPunctuation ? tok : EDGE_PUNCTUATION.matcher(tok).replaceAll("");

      if (newTok.length() < tok.length()) {
        System.err.println("Etalon phrase " + tok + " transformed into " + newTok);
      }
      Annotation annotatedContent = new Annotation(newTok);
      KpeReader.sentenceAnalyzer.annotate(annotatedContent);
      NGram id = new NGram(annotatedContent.get(TokensAnnotation.class));
      Integer seen = occurrences.get(id);
      occurrences.put(id, seen == null ? 1 : seen + 1);
    }
    return occurrences;
  }

  /** @return the ordinal number of the document within its file */
  public int getLineNumInFile() {
    return lineNumInFile;
  }

  /**
   * @param lineNum
   *          the ordinal number of the document within its file
   */
  public void setLineNumInFile(int lineNum) {
    lineNumInFile = lineNum;
  }

  /** @return the document id and the file of the document separated by a tab */
  @Override
  public String toString() {
    return documentId + "\t" + file;
  }

  /**
   * Orders documents by their file first and by their ordinal number within the file second. Returns 0
   * exactly when both agree, which keeps the ordering consistent with {@link #equals(Object)}.
   *
   * @param dd
   *          the document to compare to
   * @return a negative value, zero or a positive value if this document precedes, equals or follows the
   *         other one
   * @throws NullPointerException
   *           if the file of this document is {@code null}
   */
  @Override
  public int compareTo(DocumentData dd) {
    int fileComparison = file.compareTo(dd.getFile());
    if (fileComparison != 0) {
      return fileComparison;
    }
    int otherLineNum = dd.getLineNumInFile();
    if (lineNumInFile < otherLineNum) {
      return -1;
    }
    return lineNumInFile > otherLineNum ? 1 : 0;
  }

  /**
   * Two documents are equal if they refer to the same file and have the same ordinal number within it.
   * A {@code null} file only equals another {@code null} file.
   */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof DocumentData)) {
      return false;
    }
    DocumentData other = (DocumentData) o;
    String otherFile = other.getFile();
    boolean sameFile = file == null ? otherFile == null : file.equals(otherFile);
    return sameFile && other.getLineNumInFile() == lineNumInFile;
  }

  /** Hash code derived from the file and the ordinal number within the file. */
  @Override
  public int hashCode() {
    return (file + "_" + lineNumInFile).hashCode();
  }

  /**
   * Tells whether a sentence of a scientific document contains a bracketed reference, i.e. an opening
   * bracket token followed only by tokens that are commas, numbers or letters ending in two digits, up to
   * a closing bracket token. For documents that are not scientific the result is always false.
   *
   * @param sentence
   *          the sentence whose tokens are inspected
   * @return true if such a bracketed span is found
   */
  public boolean containsReference(CoreMap sentence) {
    if (!isScientific()) {
      return false;
    }
    List<CoreLabel> sentenceTokens = sentence.get(TokensAnnotation.class);
    if (sentenceTokens == null) {
      return false;
    }
    for (int i = 0; i < sentenceTokens.size(); ++i) {
      if (!LEFT_BRACKET.equals(sentenceTokens.get(i).word())) {
        continue;
      }
      int j = i + 1;
      while (j < sentenceTokens.size()) {
        String word = sentenceTokens.get(j).word();
        if (RIGHT_BRACKET.equals(word)) {
          return true;
        }
        if (word == null || !REFERENCE_TOKEN.matcher(word).matches()) {
          break;
        }
        ++j;
      }
      // Resume at the token that ended the candidate (the loop increment moves onto it), so that an
      // opening bracket directly following an unfinished candidate is not skipped.
      i = j - 1;
    }
    return false;
  }

  /** @return true if the document type is "semeval" or "scientific", regardless of casing */
  public boolean isScientific() {
    return SCIENTIFIC_TYPE.matcher(documentType).matches();
  }

  /**
   * Returns the sentences of the document grouped by section.
   *
   * @param reader
   *          reader used to obtain the text of the document and to tell which annotations are expected
   * @param serialize
   *          whether the annotated sections should be serialized
   * @return the sentences of the sections keyed by the 0-based position of the section
   */
  public TreeMap<Integer, List<CoreMap>> getSections(KpeReader reader, boolean serialize) {
    TreeMap<Integer, List<CoreMap>> sectionsWithSentences = new TreeMap<Integer, List<CoreMap>>();
    for (Annotation sectionAnn : tagAndParse(reader, serialize)) {
      List<CoreMap> sentencesOfSection = sectionAnn.get(SentencesAnnotation.class);
      sectionsWithSentences.put(sectionsWithSentences.size(), sentencesOfSection);
    }
    return sectionsWithSentences;
  }

  /**
   * Checks for the presence of some critical annotations. If any of those requested by the reader is
   * missing, the text needs to be re-annotated. Only the first token and the first sentence of the
   * annotation are inspected.
   * 
   * @param a
   *          annotation
   * @param r
   *          reader with the desired annotations
   * @return true if tokens or sentences are missing or empty, or if an annotation requested by the reader
   *         (MWE, named entity, syntax tree) is absent
   */
  private boolean needsReannotation(Annotation a, KpeReader r) {
    List<CoreMap> sentences = a.get(SentencesAnnotation.class);
    List<CoreLabel> tokens = a.get(TokensAnnotation.class);
    if (tokens == null || sentences == null || tokens.size() == 0 || sentences.size() == 0) {
      return true;
    }
    Set<Class<?>> sentenceAnnotations = sentences.get(0).keySet();
    Set<Class<?>> tokenAnnotations = tokens.get(0).keySet();
    if (r.getIsMweOn() && !tokenAnnotations.contains(MWEAnnotation.class)) {
      return true;
    }
    if (r.getIsNeOn() && !tokenAnnotations.contains(NamedEntityTagAnnotation.class)) {
      return true;
    }
    return r.getIsSyntaxOn() && !sentenceAnnotations.contains(TreeAnnotation.class);
  }

  /**
   * Provides the annotated sections of the document. A previously serialized version is reused if it
   * exists and can be read (and is re-annotated when critical annotations are missing from it); otherwise
   * the text is obtained from the reader, split into sections and annotated from scratch.
   *
   * @param reader
   *          reader used to obtain the text of the document and to tell which annotations are expected
   * @param serialize
   *          whether the (re-)annotated sections should be serialized
   * @return the sections of the document; errors during annotation are reported on the standard error
   *         and whatever was collected until then is returned
   */
  private List<Annotation> tagAndParse(KpeReader reader, boolean serialize) {
    String grammarFile = grammarFilePath();
    if (new File(grammarFile).exists()) {
      try {
        List<Annotation> cachedSections = readSerializedSections(grammarFile);
        if (cachedSections.size() == 0 || needsReannotation(cachedSections.get(0), reader)) {
          analyzeSections(cachedSections, grammarFile, serialize);
        }
        return cachedSections;
      } catch (Exception e) {
        // an unusable cache is not fatal: fall through and analyse the original text again
        System.err.println("Error with the serialized grammar file " + grammarFile + "\n" + e);
      }
    }

    List<String> paragraphs = determineSections(reader.getText(file, lineNumInFile));
    List<Annotation> documentSections = new ArrayList<Annotation>(paragraphs.size());
    try {
      System.err.println(file + " is to be analysed...");
      for (String section : paragraphs) {
        String sectionText = isScientific() ? applyScientificHeuristics(section) : section;
        documentSections.add(new Annotation(sectionText));
      }
      analyzeSections(documentSections, grammarFile, serialize);
    } catch (Exception e) {
      System.err.println("Error occured during the annotation of file " + file + " of line " + lineNumInFile);
      e.printStackTrace();
    }
    return documentSections;
  }

  /**
   * Builds the location of the serialized annotations of the document: a "grammar" directory next to the
   * file, containing a file named after the document (prefixed with its ordinal number if that is
   * positive) with the ".gr" extension.
   */
  private String grammarFilePath() {
    File f = new File(file);
    int numberInDoc = getLineNumInFile();
    String parent = f.getParent();
    // getParent() is null for a bare file name; do not let it end up as the literal text "null"
    String grammarDir = parent == null ? "grammar/" : parent + "/grammar/";
    String prefix = numberInDoc > 0 ? String.valueOf(numberInDoc) : "";
    return grammarDir + prefix + f.getName() + ".gr";
  }

  /**
   * Reads the serialized sections from the given file. The file is closed even if deserialization fails.
   * NOTE(review): Java deserialization is only safe on trusted files.
   */
  @SuppressWarnings("unchecked")
  private List<Annotation> readSerializedSections(String grammarFile) throws IOException, ClassNotFoundException {
    FileInputStream rawInput = new FileInputStream(grammarFile);
    try {
      ObjectInputStream in = new ObjectInputStream(new BufferedInputStream(rawInput));
      return (List<Annotation>) in.readObject();
    } finally {
      rawInput.close();
    }
  }

  /**
   * Rewrites the text of a scientific document before tokenization and reports on the standard error
   * which heuristic changed the text.
   */
  private String applyScientificHeuristics(String section) {
    // just some ugly hack to get over such expressions as inequalities that would affect the tokenizer to
    // make dull things
    String spaced = section.replaceAll("<([\\S&&[^>]]+) +", "< $1 ");
    // every match inserts a space after '<', so any difference means the heuristic fired; lengths alone
    // are not reliable because runs of spaces are collapsed at the same time
    if (!spaced.equals(section)) {
      System.err.println("Type-1 scientific document heuristic was applied for " + file);
    }
    // get rid of hyphens as well that might get into the text unintentionally
    String dehyphenated = spaced.replaceAll("([a-z0-9])-\\s+([a-z0-9])", "$1$2");
    if (dehyphenated.length() < spaced.length()) {
      System.err.println("Type-2 scientific document heuristic was applied for " + file);
    }
    return dehyphenated;
  }

  /**
   * Annotates every section with {@code KpeReader.sentenceAnalyzer} and serializes the result to the given
   * location if requested and the file of the document is known.
   */
  private void analyzeSections(List<Annotation> documentSections, String grammarFile, boolean serialize) {
    for (Annotation ann : documentSections) {
      KpeReader.sentenceAnalyzer.annotate(ann);
    }
    if (serialize && file != null) {
      NLPUtils.serialize(documentSections, grammarFile);
    }
  }

  /**
   * @param text
   *          the text of the document
   * @return the List of sections of the document
   */
  private List<String> determineSections(String text) {
    // In the simplest case the whole text is handled as one section.
    List<String> sections = new ArrayList<String>(1);
    sections.add(text);
    return sections;
  }

}