package net.krautchan.parser; /* * Copyright (C) 2011 Johannes Jander (johannes@jandermail.de) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.BufferedReader; import java.io.InputStreamReader; import java.io.Reader; import java.util.ArrayList; import java.util.List; import org.apache.http.HttpResponse; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.impl.client.DefaultHttpClient; import net.krautchan.data.KCPosting; import net.krautchan.data.KCThread; import net.krautchan.data.KODataListener; public class KCPageParser implements Runnable { private HttpClient client; private long boardDbId; private Object token; private String resolverPath = null; private KCPostingStreamParser pParser = null; private KCThreadStreamParser tParser = null; private KODataListener<KCThread> threadHandler = null; private String url = null; private KCThread thread; public KCPageParser(KCThread thread) { this.thread = thread; this.url = thread.uri; this.token = thread.uri; this.boardDbId = thread.board_id; pParser = new KCPostingStreamParser(); tParser = new KCThreadStreamParser(); tParser.setBoardId(boardDbId); tParser.setPostingParser(pParser); } public KCPageParser(String url, long boardDbId) { this.url = url; this.token = url; this.boardDbId = boardDbId; pParser = new KCPostingStreamParser(); tParser = new KCThreadStreamParser(); tParser.setBoardId(boardDbId); tParser.setPostingParser(pParser); } public List<KCThread> filterThreads (Reader reader, KCThreadStreamParser parser) throws Exception { parser.setBasePath(resolverPath); List<KCThread> threads = new ArrayList<KCThread>(); char[]filter = parser.getFilterMarker(); ThreadState state = new ThreadState (); int curChar; int pos = 0; curChar = reader.read(); while (-1 != curChar) { if ((state.curState == StateEnum.START) || (state.curState == StateEnum.READ_THREAD)) { if (curChar == filter[pos]) { pos++; if (pos == filter.length) { KCThread t = parser.parse(reader); t.board_id = boardDbId; threads.add(t); state.curState = StateEnum.START; pos = 0; } } else { pos = 0; } } curChar = reader.read(); } parser.notifyDone(); return threads; } public List<KCThread> filterThreads (Reader reader) throws Exception { return filterThreads(reader, tParser); } public KCPageParser setThreadHandler(KODataListener<KCThread> handler) { threadHandler = handler; tParser.setHandler(threadHandler, token); return this; } public KCPageParser setPostingHandler(KODataListener<KCPosting> postListener) { pParser.setHandler(postListener, token); return this; } @Override public void run() { if (null == threadHandler) { throw new IllegalArgumentException ("Cannot parse without a handler"); } if ((null == url) || (url.length() == 0)) { throw new IllegalArgumentException ("Cannot parse a NULL or empty url"); } tParser.setHandler(threadHandler, token); final char[]filter = tParser.getFilterMarker(); client = new DefaultHttpClient(); HttpGet request; if (url.startsWith("/")) { request = new HttpGet (resolverPath + url.substring(1)); } else { request = new HttpGet (url); } client.getParams().setParameter("Range", "bytes=42000-"); try { HttpResponse response = client.execute(request); BufferedReader reader = new BufferedReader (new InputStreamReader (response.getEntity().getContent())); ThreadState state = new ThreadState (); int curChar; int pos = 0; curChar = reader.read(); while (-1 != curChar) { if ((state.curState == StateEnum.START) || (state.curState == StateEnum.READ_THREAD)) { if (curChar == filter[pos]) { pos++; if (pos == filter.length) { if (null != thread) { tParser.parse(reader, thread); } else { tParser.parse(reader); } state.curState = StateEnum.START; pos = 0; } } else { pos = 0; } } curChar = reader.read(); } reader.close(); if (response.getEntity() != null ) { response.getEntity().consumeContent(); } tParser.notifyDone(); } catch (Exception e) { e.printStackTrace(); threadHandler.notifyError(e, token); } finally { client.getConnectionManager().shutdown(); // Close the instance here } } public KCPageParser setBasePath(String string) { resolverPath = string; return this; } //TODO decide: do we still need this? Wasn't the greatest idea ever. public enum StateEnum { START, START_THREAD, READ_THREAD, START_POST, READ_POST, END } //TODO decide: do we still need this? Wasn't the greatest idea ever. private static class ThreadState { public StateEnum curState = StateEnum.START; } }