SemEvalReader.java example

Explorer
kpe-master
- src
  - edu
    - stanford
      - nlp
        pipeline
        HunTokenizerAnnotator.java
        MweDictAnnotator.java
        MyCleanXmlAnnotator.java
        NormalizerAnnotator.java
        OwnMorphaAnnotator.java
        OwnPOSTaggerAnnotator.java
        StopWordAnnotator.java
        SzTEAnnotationPipeline.java
        SzTECoreNLP.java
        process
        HunPTBLexer.java
        HunTokenizer.java
        tagger
        maxent
        OwnMaxentTagger.java
        OwnTestSentence.java
  - hu
    - u_szeged
package hu.u_szeged.kpe.readers;

import hu.u_szeged.utils.NLPUtils;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;

public class SemEvalReader extends ScientificReader {

  /** Only files whose names end with the value of DEFAULT_EXTENSION are processed. */
  private static final String DEFAULT_EXTENSION = ".txt.final";
  private Map<String, String> readerKeyphrases = null;
  private Map<String, String> authorKeyphrases = null;

  /** Sets the file type handled by this reader to {@link #DEFAULT_EXTENSION}. */
  protected void setDetails() {
    this.fileType = DEFAULT_EXTENSION;
  }

  /**
   * Builds the document descriptor for one file.
   * <p>
   * On the first call the reader- and author-assigned keyphrases are loaded from {@code dir} and cached in the
   * fields of this object. When {@code goldAnnotation} is set, the author keyphrases of a document are appended to
   * its reader keyphrases.
   *
   * @param dir directory holding the {@code reader.final} and {@code author.final} answer files
   * @param file path of the document; its identifier is the {@code [CHIJ]-<digits>} or {@code <digits>} part
   *        directly preceding the file type suffix
   * @return a single-element list with the {@code ContestDocumentData} of the file
   */
  public List<DocumentData> getContent(String dir, String file) {
    if (readerKeyphrases == null) {
      // Load into locals first so that the fields are only published once both files were read successfully.
      Map<String, String> loadedReaders = etalonKeyphrases("reader", dir);
      Map<String, String> loadedAuthors = etalonKeyphrases("author", dir);
      if (goldAnnotation) {
        for (Entry<String, String> readerEntry : loadedReaders.entrySet()) {
          // A document may be missing from the author file, in which case get() returns null.
          String authorPhrases = loadedAuthors.get(readerEntry.getKey());
          if (authorPhrases != null && authorPhrases.length() > 0) {
            readerEntry.setValue(readerEntry.getValue() + "\r\n" + authorPhrases);
          }
        }
      }
      readerKeyphrases = loadedReaders;
      authorKeyphrases = loadedAuthors;
    }
    String fileId = file.replaceAll(".*[\\\\/](([CHIJ]-)?\\d+)" + fileType, "$1");
    List<DocumentData> toReturn = new ArrayList<DocumentData>(1);
    toReturn.add(new ContestDocumentData(readerKeyphrases.get(fileId), authorKeyphrases.get(fileId), file, this.getClass()));
    return toReturn;
  }

  /**
   * Extracts the cleaned text of one document.
   * <p>
   * The first non-blank line is kept as the title. Everything up to and including a line that consists of the word
   * "abstract" (case-insensitive) is dropped; if there is no such line, processing continues right after the title.
   * Every following line that {@link #isUnnecessaryLine(String)} does not reject is kept. Per-line length statistics
   * are collected until a "References" heading that is directly followed by a line starting with "[1]" is seen.
   * The kept lines and the statistics are then handed to {@code processText}.
   *
   * @param file path of the document, passed to {@code NLPUtils.readDocToCollection}
   * @param numberWithinFile not used by this implementation
   * @return the first element of the array returned by {@code processText}, or an empty string when the file
   *         contains no non-blank line
   */
  public String getText(String file, int numberWithinFile) {
    // The lines are accessed by index below, hence an ArrayList rather than a LinkedList.
    List<String> lines = new ArrayList<>();
    NLPUtils.readDocToCollection(file, lines);
    List<String> reducedLines = new ArrayList<String>(lines.size());
    List<Integer> lengths = new ArrayList<Integer>(lines.size());
    List<Integer> tokens = new ArrayList<Integer>(lines.size());
    List<Number> charPerToken = new ArrayList<Number>(lines.size());

    // The first non-blank line is treated as the title.
    int i = 0;
    for (; i < lines.size(); ++i) {
      if (lines.get(i).trim().length() > 0) {
        reducedLines.add(lines.get(i++));
        break;
      }
    }
    if (reducedLines.isEmpty()) {
      System.err.println("No content in file " + file);
      return "";
    }

    // Skip the front matter up to and including the "Abstract" heading, if there is one.
    int bodyStart = i;
    int abstractIndex = -1;
    for (int j = bodyStart; j < lines.size(); ++j) {
      if (lines.get(j).matches("(?i)\\s*abstract\\s*")) {
        abstractIndex = j;
        break;
      }
    }
    if (abstractIndex >= 0) {
      i = abstractIndex + 1;
    } else {
      // Continue right after the title; restarting at a fixed index would re-add the title
      // whenever blank lines precede it.
      i = bodyStart;
      System.err.println("No abstract for file " + file);
    }

    int sum = 0, tokenSum = 0;
    boolean referencesSeen = false;
    for (; i < lines.size(); ++i) {
      String line = lines.get(i);
      if (!referencesSeen && line.matches("(?i)(\\d+\\.?\\s+)?references?") && i + 1 < lines.size() && lines.get(i + 1).matches("\\[1\\].*")) {
        referencesSeen = true;
      }
      if (isUnnecessaryLine(line)) {
        continue;
      }
      reducedLines.add(line);
      if (referencesSeen) {
        // Lines of the reference list are kept but do not contribute to the statistics.
        continue;
      }
      int tokenNumber = line.split(" +").length;
      sum += line.length();
      lengths.add(line.length());
      tokenSum += tokenNumber;
      tokens.add(tokenNumber);
      // split() yields no token at all for a line made up of spaces only; avoid a division by zero there.
      charPerToken.add(tokenNumber == 0 ? 0.0 : (double) (line.length() - tokenNumber) / tokenNumber);
    }

    Set<String> toSkip = new HashSet<String>();
    double avgCharPerToken = NLPUtils.mean(charPerToken), devCharPerToken = 0.0;
    double devChars = 0.0, avgChars = (double) sum / lengths.size();
    double devTokens = 0.0, avgTokens = (double) tokenSum / tokens.size();

    for (int index = 0; index < lengths.size(); ++index) {
      devChars += Math.pow(avgChars - lengths.get(index), 2);
      devTokens += Math.pow(avgTokens - tokens.get(index), 2);
      devCharPerToken += Math.pow(avgCharPerToken - charPerToken.get(index).doubleValue(), 2);
    }

    // All three deviations are normalised by the number of lines the statistics were collected from.
    String[] article = processText(reducedLines, avgChars, Math.sqrt(devChars / lengths.size()), avgTokens, Math.sqrt(devTokens / tokens.size()),
        avgCharPerToken, Math.sqrt(devCharPerToken / charPerToken.size()), toSkip);

    return article[0];
  }

  /**
   * Joins the kept lines of a document into running text.
   * <p>
   * The first element of {@code reducedLines} is used as the title. The remaining lines are merged according to
   * simple layout heuristics: numbered section headings and a "References" heading start a new line, lines ending
   * in '.', '?' or ':' close a line, and other lines are joined with spaces.
   *
   * @param reducedLines the lines to process; the first one is taken as the title
   * @param avgCh not read by this implementation
   * @param devCh not read by this implementation
   * @param avgToken not read by this implementation
   * @param devToken not read by this implementation
   * @param avgTokenChar not read by this implementation
   * @param devTokenChar not read by this implementation
   * @param toSkip lines that are to be left out of the result
   * @return an array of three strings: the whole text, the text of the reference list, and the title together
   *         with the lines that precede the first recognised section heading; three empty strings when
   *         {@code reducedLines} is empty
   */
  protected String[] processText(List<String> reducedLines, double avgCh, double devCh, double avgToken, double devToken, double avgTokenChar,
      double devTokenChar, Set<String> toSkip) {
    if (reducedLines.isEmpty()) {
      return new String[] { "", "", "" };
    }
    boolean sectionTitle = false, newLine = true, referencesSeen = false, anySectionSeen = false;
    // TODO turn on mostPossibleTitle function, when felt necessary
    String title = reducedLines.get(0);
    StringBuilder sb = new StringBuilder(title);
    StringBuilder references = new StringBuilder();
    StringBuilder titleAndAbstract = new StringBuilder(title).append("\r\n");
    for (int i = 1; i < reducedLines.size(); ++i) {
      String line = reducedLines.get(i);
      if (toSkip.contains(line) || line.length() == 0)
        continue;
      if (referencesSeen) {
        // Inside the reference list a line starting with a (possibly bracketed) number opens a new entry.
        String separator = line.matches("\\[?\\d+\\]?.*") ? "\r\n" : " ";
        sb.append(separator).append(line);
        references.append(separator).append(line);
      } else if (line.trim().matches("[1-9]\\d?(\\.\\d+\\.?)*\\.?\\s+[A-Z][\\s:\\-A-Za-z\"]+")
          || (line.matches("\\s*([1-9]\\d?\\.?\\s+)?R(eferences?|EFERENCES?)"))) {
        // Numbered section heading or the heading of the reference list.
        sb.append("\r\n").append(line);
        sectionTitle = true;
        anySectionSeen = true;
        newLine = true;
        if (line.matches("(?i)\\s*(\\d+\\.?\\s+)?references?"))
          referencesSeen = true;
      } else if (line.matches(".*\\S[.?:]$")) {
        // The line ends a sentence (or introduces a list), so the output line is closed after it.
        sb.append(line).append("\r\n");
      } else if (sectionTitle && Character.isUpperCase(line.charAt(0))) {
        // First line after a heading that starts with a capital letter is appended to the heading line.
        sb.append(" ").append(line);
        sectionTitle = false;
      } else {
        sb.append(newLine ? "\r\n" : "").append(line).append(" ");
        sectionTitle = false;
        newLine = false;
      }
      if (!anySectionSeen)
        titleAndAbstract.append(line).append(" ");
    }
    return new String[] { sb.toString(), references.toString(), titleAndAbstract.toString() };
  }

  /**
   * Decides whether a line should be left out of the document text.
   * <p>
   * A line is rejected when more than half of its space-separated tokens match {@code NUM_PATTERN}, or when it
   * holds more than one counted bracket character and {@link #isBadlyParenthesized(Map)} reports the brackets as
   * unbalanced. A '[' is only counted when it is not the last character and is not followed by a digit; a ']' is
   * only counted when it is not the first character and is not preceded by a digit.
   *
   * @param line the line to examine
   * @return {@code true} if the line is to be dropped
   */
  protected boolean isUnnecessaryLine(String line) {
    final String brackets = "(){}[]";
    Map<Character, List<Integer>> positionsByBracket = new HashMap<Character, List<Integer>>();
    for (char bracket : brackets.toCharArray()) {
      positionsByBracket.put(bracket, new LinkedList<Integer>());
    }

    // Collapse runs of spaces so that token offsets can be mapped back to positions within the line.
    line = line.replaceAll(" +", " ");
    String[] words = line.split(" +");

    int numericCount = 0;
    int bracketCount = 0;
    int wordOffset = 0;
    for (String word : words) {
      if (NUM_PATTERN.matcher(word).matches()) {
        numericCount++;
      }
      for (int k = 0; k < word.length(); ++k) {
        char ch = word.charAt(k);
        if (brackets.indexOf(ch) == -1) {
          continue;
        }
        int position = wordOffset + k;
        boolean counted;
        if (ch == '[') {
          counted = position < line.length() - 1 && !Character.isDigit(line.charAt(position + 1));
        } else if (ch == ']') {
          counted = position > 0 && !Character.isDigit(line.charAt(position - 1));
        } else {
          counted = true;
        }
        if (counted) {
          positionsByBracket.get(ch).add(position);
          bracketCount++;
        }
      }
      // +1 accounts for the single space that separates consecutive tokens.
      wordOffset += word.length() + 1;
    }

    if (numericCount > 0.5 * words.length) {
      return true;
    }
    return bracketCount > 1 && isBadlyParenthesized(positionsByBracket);
  }

  /**
   * Checks whether the bracket characters of a line are arranged in a way that suggests the line is not running
   * text.
   * <p>
   * The bracket kind is ignored: the positions of '{', '(' and '[' are pooled as openers, the positions stored
   * under every other key are pooled as closers, and both pools are examined in ascending position order.
   *
   * @param indicatorPositions positions within the line, keyed by bracket character
   * @return {@code true} if the line has only openers or only closers, or if the order of openers and closers
   *         fails the checks below; {@code false} otherwise
   */
  private boolean isBadlyParenthesized(Map<Character, List<Integer>> indicatorPositions) {
    String openerChars = "{([";
    // TreeSets keep the positions sorted in ascending order.
    Set<Integer> openers = new TreeSet<Integer>();
    Set<Integer> closers = new TreeSet<Integer>();
    for (Entry<Character, List<Integer>> entry : indicatorPositions.entrySet()) {
      if (openerChars.indexOf(entry.getKey()) != -1) {
        openers.addAll(entry.getValue());
      } else {
        closers.addAll(entry.getValue());
      }
    }
    // Copied into lists to allow access by index.
    ArrayList<Integer> openerIndices = new ArrayList<Integer>(openers);
    ArrayList<Integer> closerIndices = new ArrayList<Integer>(closers);
    // Brackets of only one direction can never be balanced.
    if (openerIndices.size() == 0 || closerIndices.size() == 0)
      return true;
    // Advance to the first closer that lies after the first opener; visitedClosers then equals the number of
    // closers that precede the first opener.
    int visitedClosers = 0;
    for (; visitedClosers < closerIndices.size(); visitedClosers++) {
      if (closerIndices.get(visitedClosers) > openerIndices.get(0)) {
        // Reject when more than one closer precedes the first opener, or when the openers outnumber the
        // remaining closers by more than one.
        if (visitedClosers > 1 || openerIndices.size() - closerIndices.size() + visitedClosers > 1) {
          return true;
        } else {
          break;
        }
      }
    }
    // Walk the remaining closers and the openers in parallel.
    int oi = 0;
    for (; visitedClosers < closerIndices.size() && oi < openerIndices.size(); visitedClosers++) {
      // Return true (the line is likely to be some kind of equation) when the current opener lies after the
      // current closer, or when the next opener also lies before the current closer.
      // Note that ++oi is only executed when the first condition is false, due to short-circuit evaluation.
      if (openerIndices.get(oi) > closerIndices.get(visitedClosers)
          || (++oi < openerIndices.size() && closerIndices.get(visitedClosers) > openerIndices.get(oi)))
        return true;
    }
    return false;
  }

  /**
   * Loads the gold standard keyphrases of one annotator type.
   * <p>
   * The file {@code <dir>/<type>.final} is read through {@code NLPUtils.readAsList} with {@code " : "} as the
   * separator argument. For each returned row the first field is used as the key; the second field, if present,
   * is split on ',' and then on '+', and every resulting phrase is written on its own line.
   *
   * @param type annotator type, used as the base name of the answer file
   * @param dir directory that contains the answer file
   * @return map from the first field of each row to its phrases, each terminated by {@code "\r\n"}; rows without
   *         a second field map to an empty string
   */
  private Map<String, String> etalonKeyphrases(String type, String dir) {
    String answerFile = dir + "/" + type + ".final";
    List<List<String>> rows = NLPUtils.readAsList(answerFile, " : ", m_encoding);
    Map<String, String> phrasesById = new HashMap<String, String>();
    for (List<String> row : rows) {
      StringBuilder joined = new StringBuilder();
      if (row.size() > 1) {
        for (String group : row.get(1).split(",")) {
          for (String phrase : group.split("\\+")) {
            joined.append(phrase).append("\r\n");
          }
        }
      }
      phrasesById.put(row.get(0), joined.toString());
    }
    return phrasesById;
  }

}