package hu.u_szeged.kpe.readers;
import hu.u_szeged.kpe.candidates.NGram;
import hu.u_szeged.utils.NLPUtils;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.ObjectInputStream;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import edu.stanford.nlp.ling.CoreAnnotations.NamedEntityTagAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations.TokensAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.MweDictAnnotator.MWEAnnotation;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;
/**
 * Represents one document handled by the keyphrase extraction pipeline: its location within a
 * file, its gold-standard ("etalon") keyphrases, acronyms and formatting information, together
 * with helpers for annotating (tagging/parsing) its text with the shared Stanford pipeline,
 * optionally serializing the analysis next to the source file for reuse in later runs.
 */
public class DocumentData implements Comparable<DocumentData>, Serializable {

  private static final long serialVersionUID = -8144005167022088407L;

  /** The total number of DocumentData objects initialized; used to assign document ids. */
  private static int totalDocuments;
  /** Identifier of the document. */
  private int documentId;
  /** Location of the document. */
  private String file;
  /** Stores ordinal number of the document within the file. */
  private int lineNumInFile;
  /** Keyphrases of the document, mapped to their number of occurrences. */
  private Map<NGram, Integer> etalonKeyphrases;
  /** Acronyms of the document. */
  private Map<String, Integer> acronyms;
  /** Mapping between the formatted strings of the document and their actual formatting. */
  protected Map<NGram, Set<String>> formattedStrings;
  /** Reader-specific document type: the reader class's simple name without its "Reader" suffix. */
  private String documentType;

  /**
   * @param keyph newline-separated gold-standard keyphrases (may be null or empty)
   * @param fileName location of the document
   * @param docType reader class; its simple name (minus "Reader") becomes the document type
   */
  public DocumentData(String keyph, String fileName, Class<?> docType) {
    // NOTE(review): totalDocuments++ is not thread-safe; assumes single-threaded construction.
    documentId = totalDocuments++;
    etalonKeyphrases = transformKeyphrases(keyph);
    file = fileName;
    documentType = docType.getSimpleName().replace("Reader", "");
  }

  public int getDocId() {
    return documentId;
  }

  public void setDocId(int id) {
    documentId = id;
  }

  public Map<NGram, Set<String>> getFormattedStrings() {
    return formattedStrings;
  }

  public Map<NGram, Integer> getKeyphrases() {
    return etalonKeyphrases;
  }

  public void setKeyphrases(String keyph) {
    etalonKeyphrases = transformKeyphrases(keyph);
  }

  public String getFile() {
    return file;
  }

  public void setFile(String file) {
    this.file = file;
  }

  public Map<String, Integer> getAcronyms() {
    return acronyms;
  }

  public void setAcronyms(Map<String, Integer> acr) {
    acronyms = acr;
  }

  /** @return whether the given phrase occurred with some special formatting in the document */
  public boolean isFormatted(NGram phraseBuffer) {
    return formattedStrings != null && formattedStrings.containsKey(phraseBuffer);
  }

  /**
   * Gets all the phrases in the given string and puts them into a map with their occurrences.
   * A single leading/trailing punctuation character is stripped from each phrase, except for
   * phrases where the punctuation is meaningful (e.g. "C++", ".NET"). Each phrase is run through
   * the shared sentence analyzer so the resulting {@link NGram} keys are built from analyzed
   * tokens.
   *
   * @param keyphrases newline-separated phrases; null or empty yields an empty map
   * @return map from the analyzed phrase to its number of occurrences
   */
  public Map<NGram, Integer> transformKeyphrases(String keyphrases) {
    Map<NGram, Integer> hash = new HashMap<NGram, Integer>();
    if (keyphrases == null || keyphrases.length() == 0) {
      return hash;
    }
    for (String tok : keyphrases.split("(\r?\n)+")) {
      tok = tok.trim();
      if (tok.length() == 0) {
        continue;
      }
      String newTok;
      if (tok.equalsIgnoreCase("c++") || tok.toLowerCase().startsWith(".net")) {
        // The punctuation is part of the phrase itself here; keep it untouched.
        newTok = tok;
      } else {
        newTok = tok.replaceAll("^\\p{Punct}|\\p{Punct}$", "");
      }
      if (newTok.length() < tok.length()) {
        System.err.println("Etalon phrase " + tok + " transformed into " + newTok);
      }
      Annotation annotatedContent = new Annotation(newTok);
      KpeReader.sentenceAnalyzer.annotate(annotatedContent);
      NGram id = new NGram(annotatedContent.get(TokensAnnotation.class));
      Integer value = hash.get(id);
      hash.put(id, value == null ? 1 : value + 1);
    }
    return hash;
  }

  public int getLineNumInFile() {
    return lineNumInFile;
  }

  public void setLineNumInFile(int lineNum) {
    lineNumInFile = lineNum;
  }

  @Override
  public String toString() {
    return documentId + "\t" + file;
  }

  /**
   * Orders documents by file name, then by their ordinal number within the file.
   * Returns 0 when both components are equal, keeping the ordering consistent with
   * {@link #equals(Object)} (the previous implementation could never return 0, which
   * violated the {@code Comparable} contract and broke symmetry for equal documents).
   */
  @Override
  public int compareTo(DocumentData dd) {
    int fileComparison = file.compareTo(dd.getFile());
    if (fileComparison != 0) {
      return fileComparison;
    }
    return Integer.compare(lineNumInFile, dd.getLineNumInFile());
  }

  @Override
  public boolean equals(Object o) {
    if (!(o instanceof DocumentData)) {
      return false;
    }
    DocumentData other = (DocumentData) o;
    return file.equals(other.getFile()) && other.getLineNumInFile() == lineNumInFile;
  }

  @Override
  public int hashCode() {
    // Derived from the same fields as equals(): file and lineNumInFile.
    return (file + "_" + lineNumInFile).hashCode();
  }

  /**
   * Heuristically decides whether the sentence contains a citation-like reference. Only applies
   * to scientific document types; looks for a parenthesized group ("-LRB-" ... "-RRB-") whose
   * tokens are all commas, numbers or author-year style codes (e.g. "Abc07").
   */
  public boolean containsReference(CoreMap sentence) {
    List<CoreLabel> sentenceTokens = sentence.get(TokensAnnotation.class);
    if (documentType.matches("(?i)semeval|scientific")) {
      nobracket: for (int i = 0; i < sentenceTokens.size(); ++i) {
        if (sentenceTokens.get(i).word().equals("-LRB-")) {
          while (++i < sentenceTokens.size()) {
            CoreLabel nextToken = sentenceTokens.get(i);
            if (!nextToken.word().matches("(,|\\d+|[A-Z&a-z]+\\d{2}|-RRB-)")) {
              // Non-reference content inside the parentheses: scan for the next "-LRB-".
              continue nobracket;
            } else if (nextToken.word().equals("-RRB-")) {
              return true;
            }
          }
        }
      }
    }
    return false;
  }

  /** @return whether the document originates from a scientific-type reader (e.g. SemEval) */
  public boolean isScientific() {
    return documentType.matches("(?i)semeval|scientific");
  }

  /**
   * Analyzes the document and returns its sections (each represented by its list of sentences),
   * keyed by the section's ordinal number.
   *
   * @param reader reader used to fetch the raw text and the desired annotation settings
   * @param serialize whether the analysis should be cached to disk for later runs
   */
  public TreeMap<Integer, List<CoreMap>> getSections(KpeReader reader, boolean serialize) {
    TreeMap<Integer, List<CoreMap>> sectionsWithSentences = new TreeMap<Integer, List<CoreMap>>();
    for (Annotation sectionAnn : tagAndParse(reader, serialize)) {
      sectionsWithSentences.put(sectionsWithSentences.size(), sectionAnn.get(SentencesAnnotation.class));
    }
    return sectionsWithSentences;
  }

  /**
   * Checks for the presence of some critical annotations. In case some of the annotations
   * required by the reader are missing, the text needs to be re-annotated.
   *
   * @param a annotation to inspect
   * @param r reader with the desired annotations
   * @return true if the annotation lacks sentences/tokens or an annotation the reader requires
   */
  private boolean needsReannotation(Annotation a, KpeReader r) {
    List<CoreMap> sentences = a.get(SentencesAnnotation.class);
    List<CoreLabel> tokens = a.get(TokensAnnotation.class);
    if (tokens == null || sentences == null || tokens.size() == 0 || sentences.size() == 0) {
      return true;
    }
    // The first sentence/token is assumed representative of the whole annotation.
    Set<Class<?>> sentenceAnnotations = sentences.get(0).keySet();
    Set<Class<?>> tokenAnnotations = tokens.get(0).keySet();
    if ((r.getIsMweOn() && !tokenAnnotations.contains(MWEAnnotation.class))
        || (r.getIsNeOn() && !tokenAnnotations.contains(NamedEntityTagAnnotation.class))) {
      return true;
    }
    return r.getIsSyntaxOn() && !sentenceAnnotations.contains(TreeAnnotation.class);
  }

  /**
   * Loads the serialized ("grammar") analysis of the document if present and still sufficient for
   * the reader's settings; otherwise fetches the raw text, applies clean-up heuristics for
   * scientific documents and runs the annotator over each section.
   */
  @SuppressWarnings("unchecked")
  private List<Annotation> tagAndParse(KpeReader reader, boolean serialize) {
    int numberInDoc = getLineNumInFile();
    File f = new File(file);
    String grammarFile = f.getParent() + "/grammar/" + (numberInDoc > 0 ? numberInDoc : "") + f.getName() + ".gr";
    if (new File(grammarFile).exists()) {
      // try-with-resources closes the stream even when deserialization fails (the original
      // implementation leaked the stream on any exception before the explicit close()).
      try (ObjectInputStream in =
          new ObjectInputStream(new BufferedInputStream(new FileInputStream(grammarFile)))) {
        List<Annotation> documentSections = (List<Annotation>) in.readObject();
        if (documentSections.size() == 0 || needsReannotation(documentSections.get(0), reader)) {
          analyzeSections(documentSections, grammarFile, serialize);
        }
        return documentSections;
      } catch (Exception e) {
        // Fall through and re-analyze from the raw text.
        System.err.println("Error with the serialized grammar file " + grammarFile + "\n" + e);
      }
    }
    List<String> paragraphs = determineSections(reader.getText(file, lineNumInFile));
    List<Annotation> documentSections = new ArrayList<Annotation>(paragraphs.size());
    try {
      System.err.println(file + " is to be analysed...");
      for (String section : paragraphs) {
        if (isScientific()) {
          // Ugly hack to get over expressions such as inequalities ("<x") that would make the
          // tokenizer do dull things: separate "<" from the token that follows it.
          int originalLength = section.length();
          section = section.replaceAll("<([\\S&&[^>]]+) +", "< $1 ");
          if (originalLength - section.length() < 0) {
            System.err.println("Type-1 scientific document heuristic was applied for " + file);
          }
          // Get rid of hyphens (line-break artifacts) that might get into the text unintentionally.
          originalLength = section.length();
          section = section.replaceAll("([a-z0-9])-\\s+([a-z0-9])", "$1$2");
          if (originalLength - section.length() > 0) {
            System.err.println("Type-2 scientific document heuristic was applied for " + file);
          }
        }
        documentSections.add(new Annotation(section));
      }
      analyzeSections(documentSections, grammarFile, serialize);
    } catch (Exception e) {
      System.err.println("Error occured during the annotation of file " + file + " of line " + lineNumInFile);
      e.printStackTrace();
    }
    return documentSections;
  }

  /** Runs the shared sentence analyzer over each section, optionally caching the result to disk. */
  private void analyzeSections(List<Annotation> documentSections, String grammarFile, boolean serialize) {
    for (Annotation ann : documentSections) {
      KpeReader.sentenceAnalyzer.annotate(ann);
    }
    if (serialize && file != null) {
      NLPUtils.serialize(documentSections, grammarFile);
    }
  }

  /**
   * @param text raw document text
   * @return the List of sections of the document; in the simplest case the whole text is handled
   *     as one section
   */
  private List<String> determineSections(String text) {
    List<String> sections = new ArrayList<String>(1);
    sections.add(text);
    return sections;
  }
}