package project.persistence.builder.impl; import java.net.MalformedURLException; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.ResultSet; import java.sql.SQLException; import java.util.LinkedList; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpHost; import org.apache.http.HttpRequest; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.util.EntityUtils; import project.client.persistence.MessageBoard; import project.client.persistence.MessageThread; import project.core.mbeans.crawlers.CrawlerData; import project.persistence.builder.MessageBoardCrawler; public class GoogleGroupsMessageBoardCrawler extends BaseHttpClient implements MessageBoardCrawler { private static final String STR_DATABASE = "bachelor_project"; private static final String STR_USERNAME = "ebas"; private static final String STR_PASSWORD = "gwtebas"; private Connection connection = null; static { try { Class.forName("com.mysql.jdbc.Driver").newInstance(); } catch (Exception e) { e.printStackTrace(); } } private MessageBoard board = null; private int pageIdx = -1; private int maxPageIdx = -1; private int numMessagesPerPage = -1; private CrawlerData data = null; private static final String THREAD_REGEX_0 = "<a href=\"/group/"; private static final String THREAD_REGEX_1 = "/browse_thread/thread/([a-z0-9]*)#\"><font size=\"..\">(.*)</font></a>"; private static final String PAGE_IDX_REGEX = "<b>([0-9]*)</b> of <b>([0-9]*)</b>"; public GoogleGroupsMessageBoardCrawler () { this.numMessagesPerPage = 10; System.out.println ("Warning! The default (no parameter) constructor of " + this.getClass().getName() + " is meant to be used only by the MBean crawler. Do not use it directly, as it doesn't init anything!!"); } public GoogleGroupsMessageBoardCrawler (Object[] params) throws MalformedURLException { this.numMessagesPerPage = 10; this.initCrawler(params); } public List<MessageThread> extractMessageThreads() { List<MessageThread> threads = new LinkedList<MessageThread> (); //session.beginTransaction(); try { HttpHost target = new HttpHost(this.board.getUrl(), 80, "http"); HttpClient client = createHttpClient(); HttpRequest req = createRequest("/group/" + this.board.getName() + "/topics?start=" + (this.numMessagesPerPage * this.pageIdx) + "&sa=N"); //System.out.println("executing request to " + target + ": " + req.getRequestLine().getUri()); HttpEntity entity = null; try { HttpResponse rsp = client.execute(target, req); entity = rsp.getEntity(); if (entity != null) { String content = EntityUtils.toString(entity); if (content.indexOf(" To protect our users, we can't process your request") != -1) { System.out.println ("Your crawler was banned!!!"); } else { Pattern pattern = Pattern.compile(THREAD_REGEX_0 + data.getUrl() + THREAD_REGEX_1); Matcher matcher = pattern.matcher(content); while (matcher.find()) { MessageThread msgThread = new MessageThread (); msgThread.setMessageBoard(board); msgThread.setName(matcher.group(2)); msgThread.setUrl(matcher.group(1)); threads.add(msgThread); } //System.out.println ("Found " + threads.size() + " threads"); pattern = Pattern.compile(PAGE_IDX_REGEX); matcher = pattern.matcher(content); while (matcher.find()) { if (matcher.groupCount() == 2) { this.maxPageIdx = Integer.parseInt(matcher.group(2)) / this.numMessagesPerPage; } else { //System.out.println ("Invalid match on max page index : " + matcher.group()); TODO log } } System.out.println ("Indexed page = " + this.pageIdx + "(start = " + data.getStartPage() + ",end = " + data.getEndPage() + ",max found = " + this.maxPageIdx + ")"); } } } finally { // If we could be sure that the stream of the entity has been // closed, we wouldn't need this code to release the connection. // However, EntityUtils.toString(...) can throw an exception. // if there is no entity, the connection is already released if (entity != null) entity.consumeContent(); // release connection gracefully } } catch (Exception e) { e.printStackTrace(); } //session.getTransaction().commit(); return threads; } public int getNumPages() { if (this.maxPageIdx == -1) { // // it hasn't been initialized, so update it now // (just make a request for any page) this.maxPageIdx = 0; this.pageIdx = 0; this.extractMessageThreads(); } return this.maxPageIdx; } public int getNumThreads() { return this.getNumPages() * this.numMessagesPerPage; } public int getPage() { return this.pageIdx; } public void initCrawler(Object[] params) throws IllegalArgumentException { try { connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/" + STR_DATABASE, STR_USERNAME, STR_PASSWORD); } catch (SQLException e) { e.printStackTrace(); } if (params.length < 1 || params.length > 2) { throw new IllegalArgumentException ("Invalid number of params (expected 1, found " + params.length + ")"); } Class<?>[] classes = new Class<?>[] { CrawlerData.class }; for (int i = 0; i < params.length; i++) { if (!params [i].getClass().equals(classes [i])) { throw new IllegalArgumentException ("Invalid argument (expected class " + classes [i].getSimpleName() + ", found " + params [i].getClass().getSimpleName() + ")"); } } try { this.data = (CrawlerData) params [0]; String query = "select * from MessageBoard where name like ?;"; PreparedStatement statement = connection.prepareStatement(query); statement.setString(1, data.getUrl()); ResultSet set = statement.executeQuery(); if (!set.next()) { System.out.println ("Insert a new messsageBoard: " + data.getUrl()); query = "insert into MessageBoard(name,url) values(?,?)"; set.close(); statement.close(); statement = connection.prepareStatement(query, PreparedStatement.RETURN_GENERATED_KEYS); statement.setString(1, data.getUrl()); statement.setString(2, data.getSettings().getUrl()); statement.execute(); set = statement.getGeneratedKeys(); set.next(); this.board = new MessageBoard (); board.setId(set.getInt(1)); board.setName(data.getUrl()); board.setDescription(null); board.setUrl(data.getSettings().getUrl()); set.close(); statement.close(); } else { this.board = new MessageBoard (); board.setId(set.getInt(1)); board.setName(set.getString(2)); board.setDescription(set.getString(3)); board.setUrl(set.getString(4)); set.close(); statement.close(); } } catch (Exception e) { e.printStackTrace(); } this.setPage(data.getCurrentPage()); } public boolean setPage(int index) { if (this.getNumPages() != -1) { if (index < 0 || index > (this.getNumPages() - 1)) return false; } else { pageIdx = maxPageIdx = 0; extractMessageThreads (); } this.pageIdx = index; return true; } public boolean hasNext() { return this.pageIdx < data.getEndPage(); } public MessageBoardCrawler next() { if (!this.setPage(this.getPage() + 1)) return null; return this; } public void remove() { /* Not implemented, it does nothing (inherited from Iterator<E>, it should safely remove the current object pointed by iterator) */ } public MessageBoard getMessageBoard() { return this.board; } }