package hu.u_szeged.kpe.readers;
import hu.u_szeged.utils.NLPUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.commons.lang3.StringEscapeUtils;
import edu.stanford.nlp.ling.CoreAnnotations.SentencesAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.trees.TreeCoreAnnotations.TreeAnnotation;
import edu.stanford.nlp.util.CoreMap;
public class EpinionReader extends KpeReader {
/** Extension of the review files handled by this reader. */
private static final String DEFAULT_EXTENSION = ".xml";
/**
 * Case-insensitive pattern of tree-node labels (adjective, noun and verb POS tags plus ADJP/NP/VP) that the
 * phrase-extraction helpers of this class treat as relevant.
 */
private static final Pattern triggerCodes = Pattern.compile("(?i)JJ[RS]?|NN.{0,2}|VB.?|(ADJ|N|V)P");
/**
 * Case-insensitive pattern of the "target" words (have/has, like, love, will, can, is, are, include, lack, ...)
 * that are matched against the text of parse-tree leaves to anchor the phrase search.
 */
private static Pattern targetWords = Pattern
.compile("(?i)ha(rd|ve|s)|difficult|like|love|w(ill|o)|comes|miss(es|ing)?|can|may|might|be(en)?|['i]s|are|include(d|s)?|lack(s|ed|ing)?");
/**
 * Cache of reference annotations shared by all instances: maps the tail of a review URL (the part after
 * {@code http://www10.epinions.com/review/}) to a list of two-element arrays, where index 0 holds the phrase from the
 * third column of the annotation file and index 1 the (possibly empty) phrase from the fourth column.
 * Stays {@code null} until the annotations are first loaded.
 */
private static Map<String, List<String[]>> opinionAnnotations;
// public static PrintWriter out;
/**
 * Configures this reader for UTF-8 encoded {@code .xml} files by assigning {@code fileType} and {@code m_encoding}.
 * NOTE(review): both fields are declared outside this file, presumably in KpeReader -- confirm.
 */
protected void setDetails() {
fileType = DEFAULT_EXTENSION;
m_encoding = Charset.forName("UTF-8");
}
/**
 * Loads the reference opinion phrases from every file in {@code dir + "/keyphrases/"} into the static
 * {@code opinionAnnotations} cache. The work is done only once; later calls return immediately.
 * <p>
 * If the directory does not exist or cannot be listed, a warning is printed and the cache is left empty instead
 * of failing with a {@link NullPointerException}. The cache is published only after all files were read, so a
 * failure in the middle of loading does not leave a half-filled cache behind.
 *
 * @param dir the corpus directory that contains the {@code keyphrases} sub-directory
 */
private void setEtalonPhrases(String dir) {
  if (opinionAnnotations != null) {
    return;
  }
  Map<String, List<String[]>> loaded = new HashMap<String, List<String[]>>();
  File keyphraseDir = new File(dir + "/keyphrases/");
  // listFiles() answers null (not an empty array) when the path is not a listable directory
  File[] annotationFiles = keyphraseDir.listFiles();
  if (annotationFiles == null) {
    System.err.println("Keyphrase directory " + keyphraseDir.getAbsolutePath()
        + " cannot be listed; no reference phrases were loaded.");
  } else {
    for (File f : annotationFiles) {
      loaded.putAll(readInNotation(f.getAbsolutePath()));
    }
  }
  opinionAnnotations = loaded;
}
/**
 * Reads one tab-separated (Cp1250 encoded) annotation file.
 * <p>
 * A row whose second column starts with the epinions review URL prefix opens a new review; the remainder of that
 * URL becomes the key. Every following row with at least three columns (except the {@code AUTHOR_OPINIONS:}
 * marker row) contributes a pair {@code {column 3, column 4 or ""}} to the current review.
 * Rows that appear before the first review header are skipped, because there is no review to attach them to.
 *
 * @param file path of the annotation file
 * @return review key to its list of two-element phrase arrays
 */
private Map<String, List<String[]>> readInNotation(String file) {
  final String reviewUrlPrefix = "http://www10.epinions.com/review/";
  Map<String, List<String[]>> reviewAnnotations = new HashMap<String, List<String[]>>();
  List<List<String>> lines = NLPUtils.readAsList(file, "\t", Charset.forName("Cp1250"));
  // opinions of the review opened most recently; null until the first review header is seen
  List<String[]> currentOpinions = null;
  for (List<String> l : lines) {
    if (l.size() > 1 && l.get(1).startsWith(reviewUrlPrefix)) {
      currentOpinions = new LinkedList<String[]>();
      reviewAnnotations.put(l.get(1).substring(reviewUrlPrefix.length()), currentOpinions);
    } else if (l.size() > 2 && !l.get(1).equals("AUTHOR_OPINIONS:")) {
      if (currentOpinions == null) {
        continue;
      }
      // the fourth column, when present, carries the automatically produced counterpart of the phrase
      String automaticPhrase = l.size() > 3 ? l.get(3) : "";
      currentOpinions.add(new String[] { l.get(2), automaticPhrase });
    }
  }
  return reviewAnnotations;
}
/**
 * Looks for a node whose label matches {@code triggerCodes} but is not a verb tag ({@code VB.?}).
 * <p>
 * The node itself is tested first. Otherwise the search recurses into exactly one node: the first sibling of
 * {@code tree} inside {@code original} if there is one, else the first child of a deep copy of {@code tree}.
 * NOTE(review): only the first sibling / first child is ever followed, further ones are never examined --
 * confirm that this is intended.
 *
 * @param original the tree in which the siblings of {@code tree} are looked up
 * @param tree the node to examine, may be {@code null}
 * @return the matching node, or {@code null} if the followed path contains none
 */
private Tree getFirstInterestingSubTree(Tree original, Tree tree) {
  if (tree == null) {
    return null;
  }
  String nodeLabel = tree.label().toString();
  boolean isTrigger = triggerCodes.matcher(nodeLabel).matches();
  if (isTrigger && !nodeLabel.matches("VB.?")) {
    return tree;
  }
  if (tree.isLeaf()) {
    return null;
  }
  Tree copy = tree.deepCopy();
  List<Tree> siblings = tree.siblings(original);
  if (siblings != null && !siblings.isEmpty()) {
    return getFirstInterestingSubTree(copy, siblings.get(0));
  }
  Tree[] kids = copy.children();
  if (kids.length > 0) {
    return getFirstInterestingSubTree(copy, kids[0]);
  }
  return null;
}
/**
 * Finds the constituent that follows a target word and hands it to {@code getFirstInterestingSubTree}.
 * <p>
 * Leaves are scanned in order while remembering the most recent leaf matching {@code targetWords}. For the first
 * later non-target leaf that has an ancestor (below the root) dominating the remembered target, the child of
 * that ancestor right after its first child containing a target word is selected; a selected child that starts
 * with {@code n't}/{@code not} is skipped when another child follows it.
 *
 * @param tree parse tree of the opinion
 * @return the interesting sub-tree of the selected constituent, or {@code null} if none was found
 */
private Tree getDependantPhrase(Tree tree) {
  Tree lastTarget = null;
  Tree dependant = null;
  search: for (Tree leaf : tree.getLeaves()) {
    if (isTargetWord(leaf)) {
      lastTarget = leaf;
      continue;
    }
    if (lastTarget == null) {
      continue;
    }
    for (int height = 1; height < tree.depth(leaf); ++height) {
      Tree ancestor = leaf.ancestor(height, tree);
      if (!ancestor.dominates(lastTarget)) {
        continue;
      }
      int targetChild = indexOfChildWithTargetWord(ancestor);
      if (targetChild < 0) {
        continue;
      }
      int next = targetChild + 1;
      int childCount = ancestor.children().length;
      if (next < childCount && ancestor.getChild(next).getLeaves().get(0).toString().matches("(?i)n't|not")
          && next + 1 < childCount) {
        // step over the negation so that the phrase after it is selected
        next++;
      }
      dependant = ancestor.getChild(next);
      break search;
    }
  }
  return getFirstInterestingSubTree(dependant, dependant);
}

/** Tells whether the text of the given leaf matches {@code targetWords}. */
private boolean isTargetWord(Tree leaf) {
  return targetWords.matcher(leaf.toString()).matches();
}

/** Returns the index of the first child of {@code node} having a target word among its leaves, or -1. */
private int indexOfChildWithTargetWord(Tree node) {
  Tree[] children = node.children();
  for (int c = 0; c < children.length; ++c) {
    for (Tree childLeaf : children[c].getLeaves()) {
      if (isTargetWord(childLeaf)) {
        return c;
      }
    }
  }
  return -1;
}
/**
 * Finds a phrase of the requested type standing before the first target word.
 * <p>
 * For every leaf preceding the first leaf that matches {@code targetWords}, the ancestors are visited bottom-up
 * (the root excluded) until one dominates that target leaf; each visited ancestor whose label matches
 * {@code phraseType} replaces the current candidate, so the candidate found last is returned.
 *
 * @param tree parse tree of the opinion
 * @param phraseType regular expression the label of the phrase has to match
 * @return the candidate found last, or {@code null} if there was none
 */
private Tree getPreviousPhrase(Tree tree, String phraseType) {
  List<Tree> leaves = tree.getLeaves();
  Tree firstTarget = null;
  for (Tree leaf : leaves) {
    if (targetWords.matcher(leaf.toString()).matches()) {
      firstTarget = leaf;
      break;
    }
  }
  Tree candidate = null;
  for (Tree leaf : leaves) {
    if (targetWords.matcher(leaf.toString()).matches()) {
      break;
    }
    for (int height = 1; height < tree.depth(leaf); ++height) {
      Tree ancestor = leaf.ancestor(height, tree);
      if (ancestor.dominates(firstTarget)) {
        // everything further up spans the target word as well, go on with the next leaf
        break;
      }
      if (ancestor.label().toString().matches(phraseType)) {
        candidate = ancestor;
      }
    }
  }
  return candidate;
}
/**
 * Builds a phrase out of the words after the target word followed by the words of the NP before it.
 * <p>
 * The accepted leaves of the preceding NP are lower-cased and form the tail. The accepted leaves of the dependant
 * phrase are put in front of it, in their original order and casing, unless their lower-cased form already
 * occurs in the text assembled so far.
 *
 * @param parsing parse tree of the opinion
 * @return the assembled phrase without surrounding whitespace
 */
private String combinePhrases(Tree parsing) {
  Tree preceding = getPreviousPhrase(parsing, "NP");
  Tree following = getDependantPhrase(parsing);
  StringBuilder tail = new StringBuilder();
  for (String word : getPosLeaves(preceding)) {
    tail.append(word.toLowerCase()).append(' ');
  }
  StringBuilder head = new StringBuilder();
  for (String word : getPosLeaves(following)) {
    String assembledSoFar = head.toString() + tail;
    if (!assembledSoFar.contains(word.toLowerCase())) {
      head.append(word).append(' ');
    }
  }
  return head.append(tail).toString().trim();
}
/**
 * Cleans the opinion units and expands slash-separated alternatives.
 * <p>
 * Empty units are dropped. Runs of characters other than letters, digits, '.', ',' and '-' are replaced by a
 * space at word boundaries and everything from {@code " while "} on is cut. A unit such as
 * {@code "fast/slow charger"} becomes {@code "fast charger"} and {@code "slow charger"}, whereas
 * {@code "good screen/keyboard"} becomes {@code "good screen"} and {@code "good keyboard"}.
 *
 * @param opinionUnits raw opinion segments
 * @return the cleaned and expanded opinions
 */
private List<String> retrieveOpinions(List<String> opinionUnits) {
  List<String> opinions = new LinkedList<String>();
  for (String unit : opinionUnits) {
    if (unit.isEmpty()) {
      continue;
    }
    String cleaned = unit.replaceAll("(^|\\s+)[^\\p{Alnum}.,\\-]+|[^\\p{Alnum}.,\\-]+(\\s+|$)", " ");
    cleaned = cleaned.replaceAll(" while .*", "").trim();
    if (!cleaned.matches(".*[^\\s]/[^\\s][^/]*")) {
      opinions.add(cleaned);
      continue;
    }
    String[] parts = cleaned.split("/");
    // everything of the first alternative up to and including its last whitespace
    String prefix = parts[0].replaceAll("(.*\\s)[^\\s]+", "$1");
    if (parts[0].contains(" ")) {
      String[] headWords = parts[0].split("\\s");
      parts[0] = headWords[headWords.length - 1];
      for (String part : parts) {
        opinions.add(prefix + part);
      }
    } else {
      // what follows the first word of the second alternative is shared by the alternatives
      String postfix = parts[1].replaceAll("[^\\s]+(\\s.*)", "$1");
      parts[1] = parts[1].split("\\s")[0];
      if (parts[1].equals(postfix)) {
        opinions.add(prefix);
        opinions.add(postfix);
      } else {
        for (String part : parts) {
          opinions.add(part + postfix);
        }
      }
    }
  }
  return opinions;
}
/**
 * Collects the leaves of {@code t} whose parent label matches {@code triggerCodes}.
 *
 * @param t the tree to inspect, may be {@code null}
 * @return the text of the accepted leaves in tree order
 */
private List<String> getPosLeaves(Tree t) {
return getPosLeaves(t, triggerCodes);
}
/**
 * Collects the leaves of {@code t} whose parent label matches the given regular expression.
 * A malformed expression is reported on standard error and replaced by {@code ".*"}, i.e. every leaf is accepted.
 *
 * @param t the tree to inspect, may be {@code null}
 * @param pattern regular expression for the label of the parent of a leaf
 * @return the text of the accepted leaves in tree order
 */
private List<String> getPosLeaves(Tree t, String pattern) {
  Pattern compiled;
  try {
    compiled = Pattern.compile(pattern);
  } catch (PatternSyntaxException e) {
    System.err.println("Malformed regexp.\nNOTE: all leaves will be accepted.");
    compiled = Pattern.compile(".*");
  }
  return getPosLeaves(t, compiled);
}
/**
 * Collects the leaves of {@code t} whose parent label matches {@code pos}.
 * <p>
 * A leaf without a parent inside {@code t} (which happens when {@code t} is itself a leaf) has no label to test
 * and is skipped instead of causing a {@link NullPointerException}.
 *
 * @param t the tree to inspect; {@code null} yields an empty list
 * @param pos pattern for the label of the parent of a leaf
 * @return the text of the accepted leaves in tree order
 */
private List<String> getPosLeaves(Tree t, Pattern pos) {
  List<String> list = new LinkedList<String>();
  if (t == null) {
    return list;
  }
  for (Tree leaf : t.getLeaves()) {
    // ancestor() answers null when the requested ancestor does not exist below the given root
    Tree parent = leaf.ancestor(1, t);
    if (parent == null) {
      continue;
    }
    if (pos.matcher(parent.label().toString()).matches()) {
      list.add(leaf.toString());
    }
  }
  return list;
}
/**
 * Splits a line of opinions into segments.
 * <p>
 * The line is first cut at {@code " with "}, tabs and punctuation. Then every segment is examined once:
 * <ul>
 * <li>a segment containing {@code " and "} is parsed and, if the parse has an {@code and}/{@code but}/
 * {@code because} leaf, replaced by the result of {@code getGeneratedStructures};</li>
 * <li>a leading {@code and}/{@code but}/{@code because (of)} is stripped;</li>
 * <li>a segment containing {@code but}/{@code because (of)} is replaced by the parts around that word.</li>
 * </ul>
 * Parts that replace a segment are not examined again, but the segment following them is: the index is moved to
 * the last inserted part so that the loop increment lands on the first segment not seen yet (previously that
 * segment was skipped).
 *
 * @param line raw line listing one or more opinions
 * @return the list of opinion segments
 */
private List<String> segmentateOpinions(String line) {
  List<String> opinionSegments = new ArrayList<String>(Arrays.asList(line
      .split("\\s+with\\s+|\t|\\s*[,.:;?!&\\-]+(\\s+|$)")));
  for (int i = 0; i < opinionSegments.size(); ++i) {
    String op = opinionSegments.get(i);
    // the parts that take the place of the current segment, or null if it is not split
    List<String> replacement = null;
    if (op.matches("(?i).+\\s+and\\s+.*")) {
      Annotation document = new Annotation(op);
      sentenceAnalyzer.annotate(document);
      Tree parseTree = document.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class);
      for (Tree leaf : parseTree.getLeaves()) {
        if (leaf.toString().matches("and|but|because")) {
          replacement = getGeneratedStructures(parseTree, leaf, op);
          break;
        }
      }
    } else if (op.matches("(?i)(and|but|because( of)?)\\s+.*")) {
      opinionSegments.set(i, op.replaceAll("(?i)^(and|but|because( of)?)\\s+", ""));
    } else if (op.matches("(?i).*\\s+(but|because( of)?)\\s+.*")) {
      replacement = Arrays.asList(op.split("(?i)\\s+(but|because( of)?)\\s+"));
    }
    if (replacement != null) {
      opinionSegments.remove(i);
      opinionSegments.addAll(i, replacement);
      // continue right after the inserted parts (also correct when nothing was inserted)
      i += replacement.size() - 1;
    }
  }
  return opinionSegments;
}
/**
 * Joins the elements with single spaces and trims the result.
 *
 * @param container the strings to join, in iteration order
 * @return the joined text; empty for an empty container
 */
private String conCat(Collection<String> container) {
  StringBuilder joined = new StringBuilder();
  for (String element : container) {
    joined.append(element).append(' ');
  }
  return joined.toString().trim();
}
/**
 * Produces the expressions that replace a coordinated opinion.
 * <p>
 * If the node three levels above the conjunction leaf is {@code ROOT}, or the grandparent of the leaf has a
 * single child, the opinion is simply split at {@code " and "}. If the grandparent has exactly three children,
 * the first and the third are passed to {@code produceNewExpressions}. Otherwise the words of the children are
 * grouped around the {@code CC}/{@code and} children.
 * NOTE(review): the guard after the grouping falls back to the plain split whenever either side contains
 * words, so the re-parsing below it is reached with empty sides only -- the condition looks inverted; confirm.
 *
 * @param parse parse tree of the opinion
 * @param leaf the conjunction leaf found in {@code parse}
 * @param op text of the opinion
 * @return the expressions derived from the opinion
 */
private List<String> getGeneratedStructures(Tree parse, Tree leaf, String op) {
  Tree greatGrandParent = leaf.ancestor(3, parse);
  if (greatGrandParent.label().toString().equals("ROOT")) {
    return Arrays.asList(op.split(" and "));
  }
  Tree[] coordinated = leaf.ancestor(2, parse).children();
  if (coordinated.length == 3) {
    return produceNewExpressions(parse, coordinated[0], coordinated[2], op);
  }
  if (coordinated.length == 1) {
    return Arrays.asList(op.split(" and "));
  }
  // words of the children, a new group is opened at every "and" conjunction
  List<List<String>> sides = new ArrayList<List<String>>(2);
  sides.add(new LinkedList<String>());
  for (Tree child : coordinated) {
    boolean isAnd = child.label().toString().equals("CC")
        && child.getLeaves().get(0).label().toString().equals("and");
    if (isAnd) {
      sides.add(new LinkedList<String>());
    } else {
      sides.get(sides.size() - 1).addAll(getPosLeaves(child, ".*"));
    }
  }
  boolean moreThanTwoSides = sides.size() > 2;
  boolean firstSideHasWords = !sides.get(0).isEmpty();
  boolean secondSideHasWords = sides.size() > 1 && !sides.get(1).isEmpty();
  if (moreThanTwoSides || firstSideHasWords || secondSideHasWords) {
    return Arrays.asList(op.split(" and "));
  }
  String[] sideTexts = { conCat(sides.get(0)), conCat(sides.get(1)) };
  Tree[] reparsed = new Tree[sideTexts.length];
  for (int side = 0; side < sideTexts.length; ++side) {
    Annotation ann = new Annotation(sideTexts[side]);
    sentenceAnalyzer.annotate(ann);
    reparsed[side] = ann.get(SentencesAnnotation.class).get(0).get(TreeAnnotation.class);
  }
  return produceNewExpressions(parse, reparsed[0], reparsed[1], op);
}
/**
 * Turns a coordination of two constituents into separate expressions, depending on their labels:
 * <ul>
 * <li>adjective + noun: the opinion is split at {@code " and "} and the nouns of the right side are appended to
 * the first part;</li>
 * <li>verb + verb: the text in front of the left side is prepended to the words of each side;</li>
 * <li>anything else (noun + noun included): the opinion is split at {@code " and "}.</li>
 * </ul>
 * The words of the left side are quoted before they are used inside a regular expression, so tokens containing
 * characters such as {@code .}, {@code +} or {@code (} are matched literally and cannot make the expression
 * invalid.
 *
 * @param full parse tree of the whole opinion (not used)
 * @param left constituent before the conjunction
 * @param right constituent after the conjunction
 * @param op text of the opinion
 * @return the expressions derived from the opinion
 */
private List<String> produceNewExpressions(Tree full, Tree left, Tree right, String op) {
  String leftLabel = left.label().toString();
  String rightLabel = right.label().toString();
  if (leftLabel.matches("JJ.?|ADJP") && rightLabel.matches("N(N.{0,2}|P)")) {
    String[] parts = op.split(" and ");
    parts[0] += " " + conCat(getPosLeaves(right, "NN.{0,2}"));
    return Arrays.asList(parts);
  }
  if (leftLabel.matches("V(P|B.{0,2})") && rightLabel.matches("V(P|B.{0,2})")) {
    String lText = conCat(getPosLeaves(left, ".*"));
    String rText = conCat(getPosLeaves(right, ".*"));
    // what stands in front of the left side is shared by both generated expressions
    String beginning = op.replaceAll(Pattern.quote(lText) + ".*", "");
    return Arrays.asList(beginning + lText, beginning + rText);
  }
  return Arrays.asList(op.split(" and "));
}
/**
 * Turns a line of opinions into normalized phrases, one per line (separated by {@code "\r\n"}).
 * <p>
 * Parenthesized text is removed, the line is segmented and expanded, and every resulting opinion is parsed. For
 * each sentence of an opinion one phrase is generated by the first rule that applies to the opinion text:
 * <ol>
 * <li>a single word is kept as it is;</li>
 * <li>an opinion containing one of the listed verbs/auxiliaries (but not "missing calls") is handled by
 * {@code combinePhrases};</li>
 * <li>an "as ... as" comparison yields {@code "not " + adjective}, followed by the preceding text for "not as";</li>
 * <li>"no/not X" yields "not X";</li>
 * <li>an opinion of more than three words yields the words of its first NP, or of its first ADJP if there is no
 * NP;</li>
 * <li>otherwise the nouns, adjectives, verbs and "to" of the sentence are concatenated.</li>
 * </ol>
 * Finally a negated opinion gets a {@code "not "} prefix unless the phrase already starts with it.
 * The phrase is built from scratch for every sentence; text of an earlier sentence of the same opinion is no
 * longer carried over into the next one.
 *
 * @param line raw line listing one or more opinions
 * @return the generated phrases, each terminated by {@code "\r\n"}
 */
private String numerateOpinions(String line) {
  StringBuilder opinion = new StringBuilder();
  List<String> opinions = retrieveOpinions(segmentateOpinions(line.replaceAll("\\([^)]+\\)", "")));
  for (String op : opinions) {
    op = op.replaceAll(".* ability to (.*)", "$1").replaceAll("([^\\s])'s ", "$1 's ").replaceAll("^\\s*-", "");
    if (op.trim().length() == 0) {
      continue;
    }
    Annotation annotatedOp = new Annotation(op);
    sentenceAnalyzer.annotate(annotatedOp);
    for (CoreMap sentence : annotatedOp.get(SentencesAnnotation.class)) {
      // start from an empty phrase for each sentence, the last rule below appends to it
      String generatedPhrase = "";
      Tree parsing = sentence.get(TreeAnnotation.class);
      if (op.split(" ").length == 1) {
        generatedPhrase = op;
      } else if (!op.matches("(?i).* missing calls.*")
          && op
              .matches("(?i)(^|.+ )((difficult|hard) to|like|love|comes with|miss(es|ing)?|w(ill|on't)|ha(ve|s)|can|may|might|be(en)?|(it)?'s|(is|are)(n't)?|include(d|s)?|lack(s|ed|ing)?) .*")) {
        generatedPhrase = combinePhrases(parsing);
      } else if (op.matches("(?i)(.* )?(not )?as .* as .*")) {
        String np = op.replaceAll("(?i)(.* )?(not )?as (.*) as .*", "$1").replaceAll("(?i) not", "").trim();
        String adjp = op.replaceAll("(?i)(.* )?(not )?as (.*) as .*", "$3").trim();
        if (np.equalsIgnoreCase("not")) {
          generatedPhrase = "not " + adjp;
        } else if (op.matches("(?i).*not as.*")) {
          generatedPhrase = "not " + adjp + " " + np;
        } else {
          generatedPhrase = "not " + adjp;
        }
      } else if (op.matches("(?i)\\s*not? [^\\s]+")) {
        generatedPhrase = "not " + op.substring(op.indexOf(" ") + 1);
      } else if (op.split(" ").length > 3) {
        String[] generations = { "", "" }; // first element is for NP, second is for ADJP
        boolean[] foundPhrase = { false, false };
        List<Tree> leaves = parsing.getLeaves();
        for (Tree leaf : leaves) {
          for (int d = 2; d < parsing.depth(leaf) && (!foundPhrase[0] || !foundPhrase[1]); ++d) {
            Tree ancestor = leaf.ancestor(d, parsing);
            if (!foundPhrase[0] && ancestor.label().toString().equals("NP")) {
              for (String goodLeaf : getPosLeaves(ancestor)) {
                generations[0] += goodLeaf + " ";
              }
              foundPhrase[0] = generations[0].length() > 0;
            } else if (!foundPhrase[1] && ancestor.label().toString().equals("ADJP")) {
              for (String goodLeaf : getPosLeaves(ancestor)) {
                generations[1] += goodLeaf + " ";
              }
              foundPhrase[1] = generations[1].length() > 0;
            }
          }
        }
        generatedPhrase = generations[0].length() != 0 ? generations[0] : generations[1];
      } else {
        List<Tree> leaves = parsing.getLeaves();
        for (Tree leaf : leaves) {
          if (leaf.ancestor(1, parsing).label().toString().matches("NN.{0,2}|JJ.?|VB.?|TO")) {
            generatedPhrase += leaf + " ";
          }
        }
      }
      generatedPhrase = generatedPhrase.trim();
      // mark negated opinions, "lacks X" is rewritten to "not X"
      if (op
          .matches("(?i)(.* |^)(not?|(wo|will|do(es)?|is|are)(n't| not)|lack(s|ing|ed)?|(c|sh)ould (be|ha(ve|s|d) been)) .*")
          && !generatedPhrase.matches("(?i)not .+")) {
        generatedPhrase = "not " + generatedPhrase.replaceAll("lack(s|ing|ed) ", "");
      }
      opinion.append(generatedPhrase.length() > 0 ? generatedPhrase + "\r\n" : "");
    }
  }
  return opinion.toString();
}
/**
 * Reads a review file and creates one document per review that has reference opinion phrases.
 * <p>
 * The product name (from the {@code <product name=...>} line) and the review id (from the
 * {@code <review urlID=...>} line) form the key {@code product/reviewId} used to look up the phrases loaded by
 * {@code setEtalonPhrases}. Depending on {@code goldAnnotation}, the first or the second element of each phrase
 * pair is used. A document is emitted at the closing {@code </review>} line and carries the 1-based position of
 * the review within the file.
 * <p>
 * The phrases are cleared at the start of every review, so a review without annotations no longer re-emits the
 * phrases of the review before it.
 *
 * @param dir corpus directory containing the {@code keyphrases} sub-directory
 * @param file path of the review file
 * @return the documents of the file, or {@code null} if the file could not be read
 */
public List<DocumentData> getContent(String dir, String file) {
  setEtalonPhrases(dir);
  List<DocumentData> toReturn = new LinkedList<DocumentData>();
  int reviewNumberInFile = 0;
  try (BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(file), getEncoding()))) {
    String line, urlPart = "", opinionsOfReview = "";
    while ((line = transformLine(br.readLine())) != null) {
      if (line.startsWith("<product name=")) {
        urlPart = line.replaceAll("<product name=\"(.*)\" time=\".*\">", "$1");
      } else if (line.startsWith("<review urlID=")) {
        String reviewId = line.replaceAll("<review urlID=\"(.*)\">", "$1");
        // forget the phrases of the previous review before looking up those of this one
        opinionsOfReview = "";
        List<String[]> opinionList = opinionAnnotations.get(urlPart + "/" + reviewId);
        if (opinionList != null) {
          StringBuilder tempBuffer = new StringBuilder();
          for (String[] op : opinionList) {
            tempBuffer.append(op[goldAnnotation ? 0 : 1]).append('\n');
          }
          opinionsOfReview = tempBuffer.toString().trim();
        }
      }
      if (line.equals("</review>")) {
        reviewNumberInFile++;
        if (opinionsOfReview.length() > 0) {
          DocumentData dd = new DocumentData(opinionsOfReview, file, this.getClass());
          dd.setLineNumInFile(reviewNumberInFile);
          toReturn.add(dd);
        }
      }
      // TODO (kept from the original author): if this were not a pilot project, the line following a
      // <pros> or <cons> marker should also be processed here, i.e. passed through numerateOpinions
      // and its phrases counted for the review.
    }
    return toReturn;
  } catch (IOException io) {
    io.printStackTrace();
    return null;
  }
}
/**
 * Normalizes a line read from a review file: HTML entities are unescaped, the abbreviations {@code w/o} and
 * {@code w/} (preceded by whitespace, in any letter case) are expanded to "without" and "with", and the result
 * is trimmed.
 *
 * @param line the raw line, may be {@code null}
 * @return {@code null} for a {@code null} line, an empty string for the literal text {@code "null"}, otherwise
 *         the normalized line
 */
private String transformLine(String line) {
  if (line == null) {
    return null;
  }
  if ("null".equals(line)) {
    return "";
  }
  String unescaped = StringEscapeUtils.unescapeHtml4(line);
  String withoutExpanded = unescaped.replaceAll("(?i)\\s+w/o\\s*", " without ");
  String withExpanded = withoutExpanded.replaceAll("(?i)\\s+w/\\s*", " with ");
  return withExpanded.trim();
}
// public TreeMap<Integer, List<CoreMap>> sectionMapping(DocumentData doc) {
// TreeMap<Integer, List<CoreMap>> documentSections = new TreeMap<Integer, List<CoreMap>>();
// int sectionNumber = 0;
// documentSections.put(sectionNumber, new LinkedList<CoreMap>());
// Annotation docAnnotation = tagAndParse(doc);
// List<CoreMap> sentences = docAnnotation.get(SentencesAnnotation.class);
// List<CoreLabel> tokens = docAnnotation.get(TokensAnnotation.class);
// int tokensPassed = 0, nlOffset = 0;
// boolean paragraph = false;
// for (CoreMap sentence : sentences){
// List<CoreLabel> sentTokens = sentence.get(TokensAnnotation.class);
// tokensPassed += sentTokens.size();
//
// if (sentTokens.size() == 1 && sentTokens.get(0).word().equals("null")){
// continue;
// }
// while (tokensPassed + nlOffset < tokens.size() && tokens.get(tokensPassed +
// nlOffset).word().equals("*NL*")){
// nlOffset++;
// paragraph = true;
// }
// documentSections.get(sectionNumber).add(sentence);
// if (paragraph && documentSections.get(sectionNumber).size() > 0)
// documentSections.put(++sectionNumber, new LinkedList<CoreMap>());
// paragraph = false;
// }
// return documentSections;
// }
/**
 * Always answers {@code false}: this reader never reports a line as a section header.
 *
 * @param line the line to judge (ignored)
 * @return {@code false}
 */
@Override
protected boolean mightBeSectionHeader(String line) {
return false;
}
// public static void main(String[] args) {
// KpeReader epReader = new EpinionReader();
// epReader.initGrammar("tokenize, ssplit, cleanxml, pos, lemma, ner");
// List<DocumentData> docs =
// epReader.getContent("corpora/phones/Pantech_breEZeTM_C520_Cellular_Phone_reviews.xml");
// for (DocumentData dd : docs) {
// // // if (dd.getContent().contains("March, I switched my"))
// }
// }
/**
 * Extracts the text of one review from a review file.
 * <p>
 * Reviews are counted by their {@code <review urlID=...>} lines, starting with 1. For the requested review the
 * line following each {@code <title>} and {@code <bottomLine>} marker is taken, and after the
 * {@code </attributes>} line every line up to the closing {@code </review>} is taken as well. Each collected
 * line is terminated by {@code "\r\n"}.
 *
 * @param file path of the review file
 * @param numberWithinFile 1-based position of the review within the file
 * @return the collected text; whatever was collected so far if the file ends early or cannot be read further
 */
@Override
public String getText(String file, int numberWithinFile) {
  StringBuilder text = new StringBuilder();
  try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), getEncoding()))) {
    int currentReview = 0;
    boolean insideBody = false;
    for (String line = transformLine(reader.readLine()); line != null; line = transformLine(reader.readLine())) {
      if (line.startsWith("<review urlID=")) {
        currentReview++;
      }
      boolean requested = currentReview == numberWithinFile;
      if (requested && (line.equals("<title>") || line.equals("<bottomLine>"))) {
        // the content of these elements stands on the line after the marker
        String content = transformLine(reader.readLine());
        if (content != null) {
          text.append(content).append("\r\n");
        }
      } else if (requested && line.equals("</attributes>")) {
        insideBody = true;
      } else if (insideBody) {
        if (line.equals("</review>")) {
          return text.toString();
        }
        text.append(line).append("\r\n");
      }
    }
  } catch (IOException io) {
    io.printStackTrace();
  }
  return text.toString();
}
}