package net.krautchan.parser; /* * Copyright (C) 2011 Johannes Jander (johannes@jandermail.de) * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ import java.io.Reader; import net.krautchan.data.KCPosting; import net.krautchan.data.KODataListener; public class KCPostingStreamParser implements KCStreamParser<KCPosting> { private String resolverPath = null; private KODataListener<KCPosting> handler = null; private Object token; private static final char[][] startTags = { "<input name=\"post_".toCharArray(), "<span class=\"postsubject\">".toCharArray(), "<span class=\"postername\">".toCharArray(), "<span class=\"postdate\">".toCharArray(), "<span class=\"postnumber\"".toCharArray(), "div class=\"file_reply\">".toCharArray(), "div class=\"file_thread\">".toCharArray(), "<p id=\"post_text_".toCharArray() }; private static final KCPosting.Fields[] fields = { KCPosting.Fields.KC_NUM, KCPosting.Fields.TITLE, KCPosting.Fields.USER, KCPosting.Fields.DATE, KCPosting.Fields.URI, KCPosting.Fields.IMAGES, KCPosting.Fields.IMAGES, KCPosting.Fields.CONTENT }; private static final char[][] endTags = { "\"".toCharArray(), "</span>".toCharArray(), "</span>".toCharArray(), "</span>".toCharArray(), "</span>".toCharArray(), "<blockquote>".toCharArray(), "<blockquote>".toCharArray(), "</blockquote>".toCharArray() }; /** * This method is probably a bit frightening for the reader - actually it is for me as the * inventor too. The idea is that all fields defined in the KCPosting Fields enum are matched * simulanously against the chars read from reader. * non-matching fields are not copied, but their startTags and endTags positions reset */ @Override public KCPosting parse(Reader reader) throws Exception { KCPosting post = new KCPosting (); int curChar = -1; int[] startPositions = new int[startTags.length]; for (int i = 0; i < startPositions.length; i++) { startPositions[i] = 0; } int[] endPositions = new int[startTags.length]; for (int i = 0; i < endPositions.length; i++) { endPositions[i] = 0; } curChar = reader.read(); while (-1 != curChar) { for (int curTag = 0; curTag < startPositions.length; curTag++) { if (curChar == startTags[curTag][startPositions[curTag]]) { startPositions[curTag]++; if (startPositions[curTag] == startTags[curTag].length) { StringBuffer buf = new StringBuffer (2000); startPositions[curTag] = 0; endPositions[curTag] = 0; while ((-1 != curChar) && (endPositions[curTag] != endTags[curTag].length)) { curChar = reader.read(); buf.append((char)curChar); if (curChar == endTags[curTag][endPositions[curTag]]) { endPositions[curTag]++; } else { endPositions[curTag] = 0; } } buf.setLength(buf.length()-endTags[curTag].length); post.setField (fields[curTag], buf.toString()); if (curTag == startTags.length-1) { post.dbId = (long)(resolverPath+post.kcNummer).hashCode(); if (null != handler) { handler.notifyAdded(post, token); } return post; } startPositions[curTag] = 0; } } else if (startPositions[curTag] != 0){ startPositions[curTag] = 0; } } curChar = reader.read(); } if (null != handler) { handler.notifyDone(token); } return null; } @Override public void setHandler(KODataListener<KCPosting> handler, Object token) { this.handler = handler; this.token = token; } @Override public char[] getFilterMarker() { return "<div class=\"postheader\">".toCharArray(); } public void setBasePath(String resolverPath) { this.resolverPath = resolverPath; } @Override public void notifyDone() { handler.notifyDone(token); } }