package project.persistence.builder.impl;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.htmlparser.Tag;
import org.htmlparser.Text;
import org.htmlparser.visitors.NodeVisitor;
import project.client.persistence.Message;
import project.client.persistence.User;
public class MyVisitor extends NodeVisitor
{
private boolean bInterested = false;
private String author = null;
private String date = null;
private String msgId = null;
private Message parent = null;
private List<Message> messages;
private Tag endTag = null;
private Tag parentEndTag = null;
//
// used internally to store intermediate results
//
private List<String> foundSoFar;
private static Pattern pattern = null;
static {
/*
On May 22, 5:03 am, abhiram
@gmail.com> wrote:
*/
String p = "On [a-zA-Z0-9:,@> \n\r\t/]*";
pattern = Pattern.compile(p);
}
public MyVisitor ()
{
messages = new LinkedList<Message> ();
foundSoFar = new LinkedList<String> ();
}
public void visitTag (Tag tag)
{
/*
if (tag.getText().contains("?hide_quotes=no#msg_")) {
if (parentId == null) {
parentId = tag.getText();
parentId = parentId.substring(parentId.indexOf("?hide_quotes=no#msg_") + "?hide_quotes=no#msg_".length());
parentId = parentId.substring(0, parentId.indexOf("\""));
}
//System.out.println ("Parent id: " + parentId);
}
*/
if (tag.getTagName().equalsIgnoreCase("input")) {
if (tag.getText().startsWith("input id=\"hdn_author\"")) {
try {
author = tag.getText();
author = author.substring(author.indexOf("value=\"") + "value=\"".length());
author = author.substring(0, author.indexOf("<"));
author = author.trim();
//System.out.println ("Author: " + author);
} catch (StringIndexOutOfBoundsException e) {
// TODO ignore this, the author is not valid
}
foundSoFar.clear();
} else if (tag.getText().startsWith("input id=\"hdn_date\"")) {
try {
date = tag.getText();
date = date.substring(date.indexOf("value=\"") + "value=\"".length());
date = date.substring(0, date.length() - 1);
//System.out.println ("Date: " + date);
} catch (StringIndexOutOfBoundsException e) {
// TODO ignore this, the date is not valid (code was unable to find it)
}
}
} else if (tag.getTagName().equalsIgnoreCase("div")) {
//System.out.println (tag.getText());
if (tag.getText().startsWith("div class=\"msg wdth100\"")) {
// now we get the id of message
try {
msgId = tag.getText();
msgId = msgId.substring(msgId.indexOf("id=\"") + "id=\"".length());
msgId = msgId.substring(0, msgId.indexOf("\""));
msgId = msgId.substring(4);
//System.out.println ("msg id: " + msgId);
} catch (StringIndexOutOfBoundsException e) {
// TODO ignore this
}
}
else if (tag.getText().startsWith("div ID=qhide_") && parentEndTag == null) {
parentEndTag = tag;
}
}
if (tag.getText().matches("div id=\"inbdy\"")) {
bInterested = true;
//System.out.println ("------------------------------------------------------------------------------");
//System.out.println ("Author: " + author);
foundSoFar.clear();
endTag = tag.getEndTag();
}
}
public void visitStringNode (Text string)
{
//System.out.println (string.getText());
if (parentEndTag != null && parent == null) {
String text = string.toPlainTextString();
text = text.trim();
if (text.indexOf("wrote:") != -1) {
//System.out.println ("Candidate n: " + text);
text = text.substring(text.indexOf("wrote:") + "wrote:".length());
//System.out.println ("Candidate n': " + text);
text = text.trim();
if (text.indexOf('>') != -1) {
text = text.substring(text.indexOf('>') + 1);
if (text.indexOf('>') != -1) {
text = text.substring(0, text.indexOf('>'));
}
}
}
text = text.trim();
if (text.indexOf('>') != -1) {
text = text.substring(text.indexOf('>') + 1);
if (text.indexOf('>') != -1) {
text = text.substring(0, text.indexOf('>'));
}
}
text = text.trim();
parent = this.findParent(text);
}
if (bInterested) {
//System.out.println (string.getText());
String txt = string.getText();
//txt = txt.replaceAll(">", ">");
//txt = txt.replaceAll("<", "<");
//txt = txt.replaceAll(""", "\"");
//txt = txt.replaceAll("'", "'");
//txt = txt.replaceAll("&", "`");
//txt = txt.replaceAll("
", "");
if (txt.length() > 3) {
//foundSoFar.add(txt);
//System.out.println ("------------------------------------");
String[] tokens = txt.split("\n");
String fmt = "";
for (String t : tokens) {
t = t.trim();
if (!t.startsWith(">")) {
fmt += t + "\n";
} else {
//
}
}
fmt.replaceAll("- Hide quoted text -", "");
foundSoFar.add(fmt);
}
}
}
public void visitEndTag(Tag tag) {
//String html1 = tag.getParent().toHtml();
//System.out.println (html1);
if (tag.equals(parentEndTag)) {
// finished parent
parentEndTag = null;
}
if (tag.equals(endTag)) {
bInterested = false;
Iterator<String> i = foundSoFar.iterator();
String strMsg = "";
while (i.hasNext()) {
strMsg += i.next();
}
if (strMsg.indexOf("- Hide quoted text -") != -1) {
strMsg = strMsg.substring(0, strMsg.indexOf("- Hide quoted text -"));
}
Matcher m = pattern.matcher(strMsg);
if (m.find()) {
String str = m.group();
strMsg = strMsg.substring(0, strMsg.indexOf(str));
//System.out.println (m.group());
}
//else
//System.out.println ("No match found");
//
// create a new owner
User owner = new User ();
owner.setName(author);
Message msg = new Message ();
msg.setContent(strMsg);
msg.setUser(owner);
msg.setPublishDate(date);
msg.setUrl(msgId);
//if (parent != null)
msg.setParent(parent);
//else if (messages.size() != 0)
// msg.setParent(messages.get(0));
/*
System.out.println ("Author : " + msg.getUser().getName());
System.out.println ("Date : " + msg.getPublishDate());
System.out.println ("Id : " + msg.getUrl());
if (msg.getParent() != null)
System.out.println ("Parent : " + msg.getParent().getUrl() + "(" + getMessageIdx(msg.getParent().getUrl()) + ")");
else
System.out.println ("Parent : " + "no parent");
System.out.println ("Content : \n" + msg.getContent());
System.out.println ("--------------------------------------------------");
*/
//
//
msgId = null;
endTag = null;
parentEndTag = null;
parent = null;
//
// add the new message
messages.add(msg);
}
}
public List<Message> getMessages() {
return messages;
}
private Message findParent (String text) {
if (text == null || messages.size() == 0) {
return null;
}
if (text.length() == 0)
return null;
//System.out.println ("Finding parent with text : " + text + "(#messages = " + messages.size() + ")");
Iterator<Message> i = messages.iterator();
int idx = 0;
while (i.hasNext()) {
Message m = i.next();
if (m.getContent().indexOf(text) != -1) {
//System.out.println ("Found parent : " + idx);
return m;
}
idx ++;
}
//System.out.println ("No parent found!");
return null;
}
private int getMessageIdx (String url) {
Iterator<Message> i = messages.iterator();
int idx = 0;
while (i.hasNext()) {
if (i.next().getUrl().equals(url))
return idx;
idx ++;
}
return -1;
}
}