package hu.u_szeged.kpe.readers;
import hu.u_szeged.utils.NLPUtils;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeSet;
public class SemEvalReader extends ScientificReader {
/** Only files whose names end with the value of DEFAULT_EXTENSION are processed. */
private static final String DEFAULT_EXTENSION = ".txt.final";
private Map<String, String> readerKeyphrases = null;
private Map<String, String> authorKeyphrases = null;
/**
 * Sets the file suffix handled by this reader to {@link #DEFAULT_EXTENSION}.
 */
protected void setDetails() {
    this.fileType = DEFAULT_EXTENSION;
}
/**
 * Builds the document descriptor for one SemEval file.
 * <p>
 * On the first call the reader- and author-assigned keyphrases are loaded from {@code dir} and cached in
 * {@code readerKeyphrases} / {@code authorKeyphrases}. When {@code goldAnnotation} is set, the author
 * phrases of a document are appended to its reader phrases.
 *
 * @param dir directory holding the {@code reader.final} and {@code author.final} keyphrase files
 * @param file path of the document; its id (e.g. {@code C-41} or a bare number) is extracted by stripping
 *        the directory part and the {@code fileType} suffix. If the path does not have that shape the whole
 *        path is used as the id and both keyphrase lookups yield {@code null}.
 * @return a single-element list describing the document
 */
public List<DocumentData> getContent(String dir, String file) {
    if (readerKeyphrases == null) {
        // Load into locals first so that a failure while reading either file cannot leave the
        // cache half-initialised (reader map set, author map still null).
        Map<String, String> loadedReaders = etalonKeyphrases("reader", dir);
        Map<String, String> loadedAuthors = etalonKeyphrases("author", dir);
        if (goldAnnotation) {
            for (Entry<String, String> readers : loadedReaders.entrySet()) {
                String authors = loadedAuthors.get(readers.getKey());
                // A document may have reader phrases but no author entry at all: get() returns null then.
                if (authors != null && authors.length() > 0) {
                    readers.setValue(readers.getValue() + "\r\n" + authors);
                }
            }
        }
        authorKeyphrases = loadedAuthors;
        readerKeyphrases = loadedReaders;
    }
    String fileId = file.replaceAll(".*[\\\\/](([CHIJ]-)?\\d+)" + fileType, "$1");
    List<DocumentData> toReturn = new ArrayList<DocumentData>(1);
    toReturn.add(new ContestDocumentData(readerKeyphrases.get(fileId), authorKeyphrases.get(fileId), file, this.getClass()));
    return toReturn;
}
/**
 * Reads a document and returns its text with noisy lines removed and wrapped lines re-joined.
 * <p>
 * The first non-blank line is kept as the title. Everything between the title and a line consisting only
 * of the word "abstract" (case-insensitive) is dropped; if no such line exists, processing resumes right
 * after the title and a warning is printed to {@code System.err}. Every remaining line that
 * {@link #isUnnecessaryLine(String)} does not reject is handed to
 * {@link #processText(List, double, double, double, double, double, double, Set)}, whose first result
 * element (the full text) is returned.
 *
 * @param file path of the document to read
 * @param numberWithinFile not used by this implementation
 * @return the cleaned text, or an empty string when the file contains no non-blank line
 */
public String getText(String file, int numberWithinFile) {
    // The lines are accessed by index over and over below; an ArrayList keeps that linear,
    // whereas LinkedList.get(i) would make every access O(n).
    List<String> lines = new ArrayList<String>();
    NLPUtils.readDocToCollection(file, lines);

    // Locate the title: the first line that is not blank.
    int i = 0;
    while (i < lines.size() && lines.get(i).trim().length() == 0) {
        ++i;
    }
    if (i == lines.size()) {
        // No usable content at all; processText needs at least a title line.
        return "";
    }
    List<String> reducedLines = new ArrayList<String>(lines.size());
    reducedLines.add(lines.get(i++));
    final int firstLineAfterTitle = i;

    // Skip ahead to the line following the "abstract" heading.
    boolean abstractFound = false;
    while (i < lines.size() && !abstractFound) {
        abstractFound = lines.get(i++).matches("(?i)\\s*abstract\\s*");
    }
    if (!abstractFound) {
        // Resume right after the title (not at a fixed index) so the title is never added twice
        // when the file starts with blank lines.
        i = firstLineAfterTitle;
        System.err.println("No abstract for file " + file);
    }

    // Per-line statistics, collected only for lines before the reference list.
    List<Integer> lengths = new ArrayList<Integer>(lines.size());
    List<Integer> tokens = new ArrayList<Integer>(lines.size());
    List<Number> charPerToken = new ArrayList<Number>(lines.size());
    int sum = 0, tokenSum = 0, lineNum = 0;
    boolean referencesSeen = false;
    for (; i < lines.size(); ++i) {
        String line = lines.get(i);
        // A "References" heading only counts when the next line starts with the first entry "[1]".
        if (!referencesSeen && line.matches("(?i)(\\d+\\.?\\s+)?references?") && i + 1 < lines.size()
                && lines.get(i + 1).matches("\\[1\\].*")) {
            referencesSeen = true;
        }
        if (isUnnecessaryLine(line)) {
            continue;
        }
        lineNum++;
        reducedLines.add(line);
        if (!referencesSeen) {
            int tokenNumber = line.split(" +").length;
            sum += line.length();
            lengths.add(line.length());
            tokenSum += tokenNumber;
            tokens.add(tokenNumber);
            // (length - tokens) approximates the number of non-separator characters in the line.
            charPerToken.add((double) (line.length() - tokenNumber) / tokenNumber);
        }
    }

    // The repeated-line detection that used to fill this set is disabled, so it is always empty.
    Set<String> toSkip = new HashSet<String>();

    double avgCharPerToken = NLPUtils.mean(charPerToken), devCharPerToken = 0.0;
    // NOTE(review): devChars starts at 1.0 and is normalised by lineNum, while the other two deviations
    // start at 0.0 and are normalised by their own sample counts. Kept as found - confirm whether intended.
    double devChars = 1.0, avgChars = (double) sum / lengths.size();
    double devTokens = 0.0, avgTokens = (double) tokenSum / tokens.size();
    for (int index = 0; index < lengths.size(); ++index) {
        devChars += Math.pow(avgChars - lengths.get(index), 2);
        devTokens += Math.pow(avgTokens - tokens.get(index), 2);
        devCharPerToken += Math.pow(avgCharPerToken - charPerToken.get(index).doubleValue(), 2);
    }
    String[] article = processText(reducedLines, avgChars, Math.sqrt(devChars / lineNum), avgTokens,
            Math.sqrt(devTokens / tokens.size()), avgCharPerToken, Math.sqrt(devCharPerToken / charPerToken.size()), toSkip);
    return article[0];
}
/**
 * Re-assembles the pre-filtered lines of a paper into running text.
 * <p>
 * The first element of {@code reducedLines} is used as the title (an empty list is treated as an empty
 * title). Each further non-empty line that is not in {@code toSkip} is classified as a reference entry,
 * a section heading, a sentence-final line, the first line after a heading, or an ordinary wrapped line,
 * and is appended with the matching separator.
 *
 * @param reducedLines title followed by the content lines
 * @param avgCh average line length in characters (not used by the current heuristics)
 * @param devCh deviation of the line length (not used by the current heuristics)
 * @param avgToken average number of tokens per line (not used by the current heuristics)
 * @param devToken deviation of the token count (not used by the current heuristics)
 * @param avgTokenChar average characters per token (not used by the current heuristics)
 * @param devTokenChar deviation of characters per token (not used by the current heuristics)
 * @param toSkip lines to ignore entirely
 * @return a three-element array: the whole text, the reference list only, and the title followed by all
 *         lines that precede the first section heading
 */
protected String[] processText(List<String> reducedLines, double avgCh, double devCh, double avgToken, double devToken, double avgTokenChar,
        double devTokenChar, Set<String> toSkip) {
    boolean sectionTitle = false, newLine = true, referencesSeen = false, anySectionSeen = false;
    // TODO turn on mostPossibleTitle function, when felt necessary
    // The title is read exactly once: re-reading the same line until it became non-empty could never
    // terminate when the first line is an empty string.
    String title = reducedLines.isEmpty() ? "" : reducedLines.get(0);
    StringBuilder sb = new StringBuilder(title);
    StringBuilder references = new StringBuilder();
    StringBuilder titleAndAbstract = new StringBuilder(title).append("\r\n");
    for (int i = 1; i < reducedLines.size(); ++i) {
        String line = reducedLines.get(i);
        if (toSkip.contains(line) || line.length() == 0) {
            continue;
        }
        if (referencesSeen) {
            // Inside the reference list: a leading (optionally bracketed) number starts a new entry,
            // anything else continues the previous one.
            String separator = line.matches("\\[?\\d+\\]?.*") ? "\r\n" : " ";
            sb.append(separator).append(line);
            references.append(separator).append(line);
        } else if (line.trim().matches("[1-9]\\d?(\\.\\d+\\.?)*\\.?\\s+[A-Z][\\s:\\-A-Za-z\"]+")
                || line.matches("\\s*([1-9]\\d?\\.?\\s+)?R(eferences?|EFERENCES?)")) {
            // Numbered section heading or the references heading.
            sb.append("\r\n").append(line);
            sectionTitle = true;
            anySectionSeen = true;
            newLine = true;
            if (line.matches("(?i)\\s*(\\d+\\.?\\s+)?references?")) {
                referencesSeen = true;
            }
        } else if (line.matches(".*\\S[.?:]$")) {
            // Line ends a sentence: terminate the output line after it.
            sb.append(line).append("\r\n");
        } else if (sectionTitle && Character.isUpperCase(line.charAt(0))) {
            // Capitalised line directly after a heading is joined to that heading.
            sb.append(" ").append(line);
            sectionTitle = false;
        } else {
            // Ordinary wrapped line: start a new output line only once, then keep joining.
            sb.append(newLine ? "\r\n" : "").append(line).append(" ");
            sectionTitle = false;
            newLine = false;
        }
        if (!anySectionSeen) {
            titleAndAbstract.append(line).append(" ");
        }
    }
    return new String[] { sb.toString(), references.toString(), titleAndAbstract.toString() };
}
/**
 * Decides whether a line should be dropped from the text.
 * <p>
 * A line is rejected when more than half of its space-separated tokens match {@code NUM_PATTERN}, or when
 * it contains at least two counted brackets whose arrangement {@code isBadlyParenthesized} flags.
 * Square brackets directly enclosing a digit (as in {@code [12]}) are not counted.
 *
 * @param line the line to examine
 * @return {@code true} if the line should be discarded
 */
protected boolean isUnnecessaryLine(String line) {
    final String brackets = "(){}[]";
    Map<Character, List<Integer>> positionsByBracket = new HashMap<Character, List<Integer>>();
    for (int b = 0; b < brackets.length(); ++b) {
        positionsByBracket.put(brackets.charAt(b), new LinkedList<Integer>());
    }

    // Collapse runs of spaces so that token offsets can be mapped back to positions in the line.
    line = line.replaceAll(" +", " ");
    String[] tokens = line.split(" +");

    int numericCount = 0;
    int bracketCount = 0;
    int tokenStart = 0;
    for (String token : tokens) {
        if (NUM_PATTERN.matcher(token).matches()) {
            numericCount++;
        }
        for (int offset = 0; offset < token.length(); ++offset) {
            char ch = token.charAt(offset);
            if (brackets.indexOf(ch) == -1) {
                continue;
            }
            int position = tokenStart + offset;
            boolean counted;
            if (ch == '[') {
                // Ignore "[" that is immediately followed by a digit.
                counted = position < line.length() - 1 && !Character.isDigit(line.charAt(position + 1));
            } else if (ch == ']') {
                // Ignore "]" that is immediately preceded by a digit.
                counted = position > 0 && !Character.isDigit(line.charAt(position - 1));
            } else {
                counted = true;
            }
            if (counted) {
                positionsByBracket.get(ch).add(position);
                bracketCount++;
            }
        }
        // +1 accounts for the single space separating this token from the next.
        tokenStart += token.length() + 1;
    }

    if (numericCount > 0.5 * tokens.length) {
        return true;
    }
    return bracketCount > 1 && isBadlyParenthesized(positionsByBracket);
}
/**
 * Heuristically checks whether the brackets of a line are arranged in a way that suggests an equation
 * or other non-prose content.
 *
 * @param indicatorPositions positions of each bracket character within the line; keys in "{([" are
 *        treated as openers, every other key as a closer
 * @return {@code true} if either kind of bracket is missing entirely or the opener/closer positions do
 *         not alternate as the checks below require
 */
private boolean isBadlyParenthesized(Map<Character, List<Integer>> indicatorPositions) {
    final String openingBrackets = "{([";
    TreeSet<Integer> sortedOpeners = new TreeSet<Integer>();
    TreeSet<Integer> sortedClosers = new TreeSet<Integer>();
    for (Entry<Character, List<Integer>> bracket : indicatorPositions.entrySet()) {
        boolean isOpener = openingBrackets.indexOf(bracket.getKey()) != -1;
        (isOpener ? sortedOpeners : sortedClosers).addAll(bracket.getValue());
    }
    if (sortedOpeners.isEmpty() || sortedClosers.isEmpty()) {
        return true;
    }
    List<Integer> open = new ArrayList<Integer>(sortedOpeners);
    List<Integer> close = new ArrayList<Integer>(sortedClosers);

    // Skip the closers that are not located after the very first opener.
    int c = 0;
    while (c < close.size() && close.get(c) <= open.get(0)) {
        c++;
    }
    // Too many skipped closers, or too many surplus openers, marks the line as bad.
    if (c < close.size() && (c > 1 || open.size() - close.size() + c > 1)) {
        return true;
    }

    // Pair the remaining closers with the openers one by one.
    int o = 0;
    while (c < close.size() && o < open.size()) {
        // return true i.e. likely to be some kind of equation
        if (open.get(o) > close.get(c)) {
            return true;
        }
        o++;
        if (o < open.size() && close.get(c) > open.get(o)) {
            return true;
        }
        c++;
    }
    return false;
}
/**
 * Loads the gold-standard keyphrases of one annotator type.
 * <p>
 * Reads {@code dir/type.final} through {@code NLPUtils.readAsList} with {@code " : "} as the separator.
 * The first field of every record becomes the map key; the second field, when present, is split on
 * commas and then on {@code +}, and each resulting piece is stored on its own {@code \r\n}-terminated
 * line. Records without a second field map to an empty string.
 *
 * @param type annotator type, used as the base name of the file (e.g. "reader" or "author")
 * @param dir directory containing the keyphrase file
 * @return map from the first field of each record to its newline-separated keyphrases
 */
private Map<String, String> etalonKeyphrases(String type, String dir) {
    String path = dir + "/" + type + ".final";
    List<List<String>> records = NLPUtils.readAsList(path, " : ", m_encoding);
    Map<String, String> phrasesById = new HashMap<String, String>();
    for (List<String> fields : records) {
        StringBuilder joined = new StringBuilder();
        if (fields.size() > 1) {
            for (String group : fields.get(1).split(",")) {
                // "+" separates alternative forms of the same phrase; each is kept as its own entry.
                for (String variant : group.split("\\+")) {
                    joined.append(variant).append("\r\n");
                }
            }
        }
        phrasesById.put(fields.get(0), joined.toString());
    }
    return phrasesById;
}
}