package project.persistence.builder.impl;
import java.util.Iterator;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpRequest;
import org.apache.http.HttpResponse;
import org.apache.http.client.HttpClient;
import org.apache.http.util.EntityUtils;
import org.htmlparser.Parser;
import project.client.persistence.Message;
import project.client.persistence.MessageThread;
import project.persistence.builder.MessageThreadCrawler;
public class GoogleGroupsThreadCrawler extends BaseHttpClient implements MessageThreadCrawler {
private MessageThread messageThread;
private boolean read;
public GoogleGroupsThreadCrawler () {
//
// do nothing, used by
this.read = false;
System.out.println ("Warning! The default (no parameter) constructor of " +
this.getClass().getName() + " is meant to be used only by the MBean crawler. Do not use it directly, as it doesn't init anything!!");
}
public GoogleGroupsThreadCrawler (Object[] params) {
this.initCrawler(params);
}
public List<Message> extractMessages() {
List<Message> msgs = null;
try {
HttpHost target = new HttpHost(this.messageThread.getMessageBoard().getUrl(), 80, "http");
HttpClient client = createHttpClient();
HttpRequest req = createRequest("/group/" + this.messageThread.getMessageBoard().getName() +
"/browse_thread/thread/" + this.messageThread.getUrl());
//System.out.println("executing request to " + target + ": " + req.getRequestLine().getUri());
HttpEntity entity = null;
try {
HttpResponse rsp = client.execute(target, req);
entity = rsp.getEntity();
if (entity != null) {
String content = EntityUtils.toString(entity);
content = content.replaceAll("<br>", "\n");
content = content.replaceAll("<", "<");
content = content.replaceAll(">", ">");
content = content.replaceAll(" ", " ");
content = content.replaceAll("<p>", "\n\n");
content = content.replaceAll(""", "\"");
Parser parser = new Parser (content);
MyVisitor v = new MyVisitor ();
parser.visitAllNodesWith(v);
msgs = v.getMessages();
/*
Iterator<Message> i = msgs.iterator();
while (i.hasNext()) {
Message msg = i.next();
System.out.println ("--------------------------------------------------");
System.out.println ("Author: " + msg.getUser().getName());
System.out.println ("Content: " + msg.getContent());
}
*/
content = null;
}
} finally {
// If we could be sure that the stream of the entity has been
// closed, we wouldn't need this code to release the connection.
// However, EntityUtils.toString(...) can throw an exception.
// if there is no entity, the connection is already released
if (entity != null)
entity.consumeContent(); // release connection gracefully
}
} catch (Exception e) {
e.printStackTrace();
}
return msgs;
}
public void initCrawler(Object[] params) throws IllegalArgumentException {
if (params == null)
throw new IllegalArgumentException ("init params are null");
if (params.length != 1)
throw new IllegalArgumentException ("invalid number of params (expected 1, found " + params.length + ")");
Class<?>[] classes = new Class<?>[] {
MessageThread.class
};
for (int i = 0; i < params.length; i++) {
if (!params [i].getClass().equals(classes [i])) {
throw new IllegalArgumentException ("Invalid argument (expected class " +
classes [i].getSimpleName() + ", found " +
params [i].getClass().getSimpleName() + ")");
}
}
this.messageThread = (MessageThread) params [0];
read = false;
}
public boolean hasNext() {
return (!read);
}
public MessageThreadCrawler next() {
this.read = true;
return this;
}
public void remove() {
/* Not implemented */
}
}