package com.mboarder.data;

/*
 *
 *  Licensed under the Apache License, Version 2.0 (the "License");
 *  you may not use this file except in compliance with the License.
 *  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *  Unless required by applicable law or agreed to in writing, software
 *  distributed under the License is distributed on an "AS IS" BASIS,
 *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 *  See the License for the specific language governing permissions and
 *  limitations under the License.
 */

import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;

import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.TagNode;

import android.util.Log;

import com.mboarder.bean.Topic;
import com.mboarder.bean.TopicPost;
import com.mboarder.string.TextViewString;

/**
 * Utility that downloads MITBBS mobile pages and extracts navigation links,
 * board topic lists and topic posts from the cleaned HTML tree.
 *
 * <p>Every fetching method swallows failures: the exception is logged and
 * whatever was collected up to that point (often an empty list) is returned.
 *
 * @author Luyi Wang
 */
public class FetchWebpage {
    private static final String TAG = "FetchWebPage";
    private static final String BASE_URL = "http://www.mitbbs.com";
    /** Prefix prepended to every relative {@code href} read from a page. */
    private static final String MOBILE_BASE_URL = "http://www.mitbbs.com/mobile/";

    /** Character encoding used by the convenience overloads. */
    private static final String DEFAULT_ENCODING = "GB2312";
    /** {@code id} attribute value of the navigation elements. */
    private static final String NAVIGATION_ID = "sy_biaoti";
    /** {@code id} attribute value of the post elements on a top-topic page. */
    private static final String TOP_TOPIC_POST_ID = "wenzhangyudu";
    /** Tag name of the entries on a board page and on a board-topic page. */
    private static final String LIST_ITEM_TAG = "li";

    /** Link text "previous page". */
    private static final String LABEL_PREVIOUS_PAGE = "\u4E0A\u9875";
    /** Link text "next page". */
    private static final String LABEL_NEXT_PAGE = "\u4E0B\u9875";
    /** Link text "previous post in the same topic". */
    private static final String LABEL_PREVIOUS_IN_TOPIC = "\u540C\u4E3B\u9898\u4E0A\u7BC7";
    /** Link text "next post in the same topic". */
    private static final String LABEL_NEXT_IN_TOPIC = "\u540C\u4E3B\u9898\u4E0B\u7BC7";

    /** Marker "author" inside a board entry. */
    private static final String AUTHOR_MARKER = "\u4F5C\u8005";
    /** Marker "posting time" inside a board entry. */
    private static final String POST_TIME_MARKER = "\u53D1\u8868\u65F6\u95F4";
    /** Marker "sender" inside a post header line. */
    private static final String SENDER_MARKER = "\u53D1\u4FE1\u4EBA";
    /** Display prefix "author: ". */
    private static final String AUTHOR_PREFIX = "\u4F5C\u8005: ";
    /** Display prefix "time: ". */
    private static final String TIME_PREFIX = "\u65F6\u95F4: ";

    // NOTE(review): this marker shows up as a single whitespace character in the
    // checked-in file and is written here as a no-break space. The "+7" offset used
    // with it suggests it may once have been the six-character HTML entity for a
    // no-break space -- confirm against what TagNode.getText() really returns.
    private static final String TITLE_LEAD_MARKER = "\u00A0";

    private HtmlCleaner htmlCleaner;
    private ArrayList<String> messageStringArray;
    private URL mitbbspageURL;

    /**
     * Creates a fetcher for the given page address with a fresh cleaner and an
     * empty message list.
     *
     * @param mitbbspageAddress textual URL of the page
     * @throws Exception if the address is not a valid URL
     */
    public FetchWebpage(String mitbbspageAddress) throws Exception {
        this(new URL(mitbbspageAddress), new ArrayList<String>(), new HtmlCleaner());
    }

    /**
     * Creates a fetcher for the given page with a fresh cleaner and an empty
     * message list.
     *
     * @param mitbbspageURL URL of the page
     */
    public FetchWebpage(URL mitbbspageURL) {
        this(mitbbspageURL, new ArrayList<String>(), new HtmlCleaner());
    }

    /**
     * Creates a fetcher from explicitly supplied collaborators.
     *
     * @param mitbbspageURL      URL used by the no-argument parse methods
     * @param messageStringArray message list stored on this instance
     * @param htmlCleaner        cleaner used by the convenience overloads
     */
    public FetchWebpage(URL mitbbspageURL, ArrayList<String> messageStringArray,
            HtmlCleaner htmlCleaner) {
        this.mitbbspageURL = mitbbspageURL;
        this.messageStringArray = messageStringArray;
        this.htmlCleaner = htmlCleaner;
    }

    /** @return the cleaner used by the convenience overloads */
    public HtmlCleaner getHtmlCleaner() {
        return htmlCleaner;
    }

    /** @param htmlCleaner the cleaner to use from now on */
    public void setHtmlCleaner(HtmlCleaner htmlCleaner) {
        this.htmlCleaner = htmlCleaner;
    }

    /** @return the message list stored on this instance */
    public ArrayList<String> getMessageStringArray() {
        return messageStringArray;
    }

    /** @param messageStringArray the message list to store */
    public void setMessageStringArray(ArrayList<String> messageStringArray) {
        this.messageStringArray = messageStringArray;
    }

    /** @return the page URL used by the no-argument parse methods */
    public URL getMitbbspageURL() {
        return mitbbspageURL;
    }

    /** @param mitbbspageURL the page URL to use from now on */
    public void setMitbbspageURL(URL mitbbspageURL) {
        this.mitbbspageURL = mitbbspageURL;
    }

    /**
     * Reads the board navigation links of the page at the given address.
     *
     * @param mitbbspageAddress textual URL of the board page
     * @return see {@link #getBoardLinks(HtmlCleaner, URL, String, String)}
     * @throws Exception if the address is not a valid URL
     */
    public ArrayList<URL> getBoardLinks(String mitbbspageAddress) throws Exception {
        return this.getBoardLinks(new URL(mitbbspageAddress));
    }

    /**
     * Reads the board navigation links of the given page using this instance's
     * cleaner, GB2312 and the {@code sy_biaoti} element id.
     *
     * @param mitbbspageURL URL of the board page
     * @return see {@link #getBoardLinks(HtmlCleaner, URL, String, String)}
     */
    public ArrayList<URL> getBoardLinks(URL mitbbspageURL) {
        return this.getBoardLinks(htmlCleaner, mitbbspageURL, DEFAULT_ENCODING, NAVIGATION_ID);
    }

    /**
     * Collects the navigation links of a board page. The links are every
     * {@code <a>} found under the <em>second</em> element whose {@code id}
     * equals {@code tagName}.
     *
     * <p>When exactly two links are found, two {@code null} entries are put in
     * front of them, so the result is {@code [null, null, first, second]}.
     * NOTE(review): presumably the two missing slots are the links that do not
     * exist on the first page of a board -- confirm against the caller.
     *
     * @param htmlCleaner   cleaner used to build the HTML tree
     * @param mitbbspageURL page to download
     * @param encoding      character encoding of the page
     * @param tagName       value of the {@code id} attribute to look for
     * @return the links found; possibly empty or partial when fetching or
     *         parsing failed (the failure is logged)
     */
    public ArrayList<URL> getBoardLinks(HtmlCleaner htmlCleaner, URL mitbbspageURL,
            String encoding, String tagName) {
        Log.i(TAG, "getBoardLinks");
        ArrayList<URL> links = new ArrayList<URL>();
        try {
            TagNode root = fetchDocument(htmlCleaner, mitbbspageURL, encoding);
            List<?> nodes = root.getElementListByAttValue("id", tagName, true, true);
            Log.i(TAG, "getBoardLinks nodes.length=" + nodes.size());
            // The second matching element carries the board navigation.
            TagNode navigation = (TagNode) nodes.get(1);
            List<?> anchors = navigation.getElementListByName("a", true);
            for (int i = 0; i < anchors.size(); i++) {
                TagNode anchor = (TagNode) anchors.get(i);
                links.add(new URL(MOBILE_BASE_URL + anchor.getAttributeByName("href")));
            }
            if (links.size() == 2) {
                // Shift the two real links to positions 2 and 3.
                links.add(0, null);
                links.add(0, null);
            }
        } catch (Exception ex) {
            Log.e(TAG, "getBoardLinks" + ex.toString(), ex);
        }
        return links;
    }

    /**
     * Reads the previous/next links of this instance's page.
     *
     * @return see {@link #getPageLinks(HtmlCleaner, URL, String, String)}
     */
    public ArrayList<URL> getPageLinks() {
        return this.getPageLinks(this.getMitbbspageURL());
    }

    /**
     * Reads the previous/next links of the page at the given address.
     *
     * @param mitbbspageAddr textual URL of the page
     * @return see {@link #getPageLinks(HtmlCleaner, URL, String, String)}
     * @throws Exception if the address is not a valid URL
     */
    public ArrayList<URL> getPageLinks(String mitbbspageAddr) throws Exception {
        return this.getPageLinks(htmlCleaner, new URL(mitbbspageAddr), DEFAULT_ENCODING,
                NAVIGATION_ID);
    }

    /**
     * Reads the previous/next links of the given page using this instance's
     * cleaner, GB2312 and the {@code sy_biaoti} element id.
     *
     * @param mitbbspageURL URL of the page
     * @return see {@link #getPageLinks(HtmlCleaner, URL, String, String)}
     */
    public ArrayList<URL> getPageLinks(URL mitbbspageURL) {
        return this.getPageLinks(htmlCleaner, mitbbspageURL, DEFAULT_ENCODING, NAVIGATION_ID);
    }

    /**
     * Collects the "previous" and "next" links of a page. The links are looked
     * up among the {@code <a>} elements under the <em>first</em> element whose
     * {@code id} equals {@code tagName}. An anchor counts as "previous" or
     * "next" when its text equals the page-level or the same-topic label.
     *
     * <p>If no "previous" link exists the page's own URL is inserted at index
     * 0. If no "next" link exists the page's own URL is inserted at index 1.
     * With no anchors, or no matching anchors, the result is the page URL twice.
     *
     * @param htmlCleaner   cleaner used to build the HTML tree
     * @param mitbbspageURL page to download
     * @param encoding      character encoding of the page
     * @param tagName       value of the {@code id} attribute to look for
     * @return the links in the order described above; possibly empty or
     *         partial when fetching or parsing failed (the failure is logged)
     */
    public ArrayList<URL> getPageLinks(HtmlCleaner htmlCleaner, URL mitbbspageURL,
            String encoding, String tagName) {
        Log.i(TAG, "getPageLinks");
        ArrayList<URL> links = new ArrayList<URL>();
        try {
            TagNode root = fetchDocument(htmlCleaner, mitbbspageURL, encoding);
            List<?> nodes = root.getElementListByAttValue("id", tagName, true, true);
            Log.i(TAG, "nodes.length=" + nodes.size());
            TagNode navigation = (TagNode) nodes.get(0);
            List<?> anchors = navigation.getElementListByName("a", true);
            Log.i(TAG, "" + anchors.size());
            if (anchors.isEmpty()) {
                links.add(0, mitbbspageURL);
                links.add(1, mitbbspageURL);
                return links;
            }

            boolean previous = false;
            boolean next = false;
            for (int i = 0; i < anchors.size(); i++) {
                TagNode anchor = (TagNode) anchors.get(i);
                String label = anchor.getText().toString();
                Log.i(TAG, label);
                if (label.equals(LABEL_PREVIOUS_PAGE) || label.equals(LABEL_PREVIOUS_IN_TOPIC)) {
                    previous = true;
                    links.add(new URL(MOBILE_BASE_URL + anchor.getAttributeByName("href")));
                }
                if (label.equals(LABEL_NEXT_PAGE) || label.equals(LABEL_NEXT_IN_TOPIC)) {
                    next = true;
                    links.add(new URL(MOBILE_BASE_URL + anchor.getAttributeByName("href")));
                }
            }
            if (!previous) {
                // Inserting at the front also works when nothing matched at all.
                links.add(0, mitbbspageURL);
            }
            if (!next) {
                links.add(1, mitbbspageURL);
            }
            Log.i(TAG, "return links length is " + links.size());
        } catch (Exception ex) {
            Log.e(TAG, "getPageLinks" + ex.toString(), ex);
        }
        return links;
    }

    /**
     * Parses the topic list of this instance's page.
     *
     * @return see {@link #parseMITBBSBoard(HtmlCleaner, URL, String, String)}
     */
    public ArrayList<Topic> parseMITBBSBoard() {
        return this.parseMITBBSBoard(this.htmlCleaner, this.getMitbbspageURL(),
                DEFAULT_ENCODING, LIST_ITEM_TAG);
    }

    /**
     * Parses the topic list of the board page at the given address.
     *
     * @param mitbbsPageAddress textual URL of the board page
     * @return see {@link #parseMITBBSBoard(HtmlCleaner, URL, String, String)}
     * @throws Exception if the address is not a valid URL
     */
    public ArrayList<Topic> parseMITBBSBoard(String mitbbsPageAddress) throws Exception {
        return this.parseMITBBSBoard(this.htmlCleaner, new URL(mitbbsPageAddress),
                DEFAULT_ENCODING, LIST_ITEM_TAG);
    }

    /**
     * Parses the topics listed on a board page. Each element named
     * {@code tagName} is turned into one {@link Topic} built from the element's
     * text (title, author, date) and the {@code href} of its first {@code <a>}.
     *
     * <p>Parsing stops at the first element that does not have the expected
     * layout; topics parsed before it are still returned.
     *
     * @param htmlCleaner   cleaner used to build the HTML tree
     * @param mitbbspageURL page to download
     * @param encoding      character encoding of the page, should be "gb2312"
     * @param tagName       element name of one entry, should be "li"
     * @return the topics parsed so far; never {@code null}
     */
    public ArrayList<Topic> parseMITBBSBoard(HtmlCleaner htmlCleaner, URL mitbbspageURL,
            String encoding, String tagName) {
        Log.i(TAG, "parseMITBBSBoard()");
        ArrayList<Topic> topics = new ArrayList<Topic>();
        try {
            TagNode root = fetchDocument(htmlCleaner, mitbbspageURL, encoding);
            List<?> nodes = root.getElementListByName(tagName, true);
            Log.i(TAG, "nodes.length=" + nodes.size());
            for (int i = 0; i < nodes.size(); i++) {
                TagNode entry = (TagNode) nodes.get(i);
                String parsedText = entry.getText().toString();

                // The title starts after the lead marker, or after "]" when the
                // marker is absent, and runs up to the first "(".
                int titleStart = parsedText.contains(TITLE_LEAD_MARKER)
                        ? parsedText.lastIndexOf(TITLE_LEAD_MARKER) + 7
                        : parsedText.indexOf("]") + 1;
                String title = parsedText.substring(titleStart, parsedText.indexOf("("));
                Log.i(TAG, title);

                int authorAt = parsedText.indexOf(AUTHOR_MARKER);
                int postTimeAt = parsedText.indexOf(POST_TIME_MARKER);
                String user = parsedText.substring(authorAt + 3, postTimeAt);
                Log.i(TAG, user);

                String date = parsedText.substring(postTimeAt + 5, parsedText.length() - 1);
                Log.i(TAG, date);

                TagNode anchor = entry.findElementByName("a", true);
                String address = MOBILE_BASE_URL + anchor.getAttributeByName("href");
                Log.i(TAG, address);

                topics.add(new Topic(TextViewString.RemoveHtmlMarker(title),
                        AUTHOR_PREFIX + TextViewString.RemoveHtmlMarker(user),
                        address,
                        TIME_PREFIX + TextViewString.RemoveHtmlMarker(date)));
            }
        } catch (Exception ex) {
            Log.e(TAG, "parseMITBBSBoard" + ex.toString(), ex);
        }
        Log.i(TAG, String.valueOf(topics.size()));
        return topics;
    }

    /**
     * Parses the posts of a page, choosing the parser from the URL text:
     * {@code marticle.php} selects the top-topic parser and
     * {@code marticle_t.php} selects the board-topic parser.
     *
     * @param url textual URL of the page
     * @return the parsed posts, or {@code null} when the URL matches neither
     *         pattern
     * @throws Exception if the address is not a valid URL
     */
    public ArrayList<TopicPost> parsePage(String url) throws Exception {
        Log.i(TAG, "parsePage");
        if (url.contains("marticle.php")) {
            return parseTopTopic(url);
        }
        if (url.contains("marticle_t.php")) {
            return parseMITBBSBoardTopic(url);
        }
        return null;
    }

    /**
     * Parses the posts of this instance's page as a top-topic page.
     *
     * @return see {@link #parseTopTopic(HtmlCleaner, URL, String, String)}
     */
    public ArrayList<TopicPost> parseTopTopic() {
        return this.parseTopTopic(this.getHtmlCleaner(), mitbbspageURL, DEFAULT_ENCODING,
                TOP_TOPIC_POST_ID);
    }

    /**
     * Parses the posts of the top-topic page at the given address. When the
     * address is {@code null} or not a valid URL, this instance's page URL is
     * used instead and a warning is logged.
     *
     * @param TopTopicPageAddress textual URL of the top-topic page
     * @return see {@link #parseTopTopic(HtmlCleaner, URL, String, String)}
     */
    public ArrayList<TopicPost> parseTopTopic(String TopTopicPageAddress) {
        URL pageUrl = this.getMitbbspageURL();
        if (TopTopicPageAddress != null) {
            try {
                pageUrl = new URL(TopTopicPageAddress);
            } catch (java.net.MalformedURLException ex) {
                Log.w(TAG, "parseTopTopic" + ex.toString(), ex);
            }
        }
        return this.parseTopTopic(this.htmlCleaner, pageUrl, DEFAULT_ENCODING,
                TOP_TOPIC_POST_ID);
    }

    /**
     * Parses the posts of a top-topic page. Each element whose {@code id}
     * equals {@code tagName} is one post. Within the element's text the sender
     * is read from the second line, the date from the fourth line, and the
     * message from the fifth line up to (not including) the last two lines.
     *
     * <p>Parsing stops at the first element that does not have the expected
     * layout; posts parsed before it are still returned.
     *
     * @param htmlCleaner   cleaner used to build the HTML tree
     * @param mitbbspageURL page to download
     * @param encoding      character encoding of the page
     * @param tagName       value of the {@code id} attribute of a post element
     * @return the posts parsed so far; never {@code null}
     */
    public ArrayList<TopicPost> parseTopTopic(HtmlCleaner htmlCleaner, URL mitbbspageURL,
            String encoding, String tagName) {
        Log.i(TAG, "parseTopTopic");
        ArrayList<TopicPost> posts = new ArrayList<TopicPost>();
        try {
            TagNode root = fetchDocument(htmlCleaner, mitbbspageURL, encoding);
            List<?> nodes = root.getElementListByAttValue("id", tagName, true, true);
            Log.i(TAG, "nodes.length=" + nodes.size());
            for (int i = 0; i < nodes.size(); i++) {
                TagNode postNode = (TagNode) nodes.get(i);
                posts.add(parsePost(postNode.getText().toString(), 1));
            }
        } catch (Exception ex) {
            Log.e(TAG, "parseTopTopic" + ex.toString(), ex);
        }
        return posts;
    }

    /**
     * Parses the posts of this instance's page as a board-topic page.
     *
     * @return see {@link #parseMITBBSBoardTopic(HtmlCleaner, URL, String, String)}
     */
    public ArrayList<TopicPost> parseMITBBSBoardTopic() {
        return this.parseMITBBSBoardTopic(this.htmlCleaner, this.getMitbbspageURL(),
                DEFAULT_ENCODING, LIST_ITEM_TAG);
    }

    /**
     * Parses the posts of the board-topic page at the given address.
     *
     * @param mitbbsPageAddress textual URL of the board-topic page
     * @return see {@link #parseMITBBSBoardTopic(HtmlCleaner, URL, String, String)}
     * @throws Exception if the address is not a valid URL
     */
    public ArrayList<TopicPost> parseMITBBSBoardTopic(String mitbbsPageAddress)
            throws Exception {
        return this.parseMITBBSBoardTopic(this.htmlCleaner, new URL(mitbbsPageAddress),
                DEFAULT_ENCODING, LIST_ITEM_TAG);
    }

    /**
     * Parses the posts of a board-topic page. Each element named
     * {@code tagName} is one post. Within the element's text the sender is
     * read from the first line, the date from the third line, and the message
     * from the fourth line up to (not including) the last two lines.
     *
     * <p>Parsing stops at the first element that does not have the expected
     * layout; posts parsed before it are still returned.
     *
     * @param htmlCleaner   cleaner used to build the HTML tree
     * @param mitbbspageURL page to download
     * @param encoding      character encoding of the page
     * @param tagName       element name of one post
     * @return the posts parsed so far; never {@code null}
     */
    public ArrayList<TopicPost> parseMITBBSBoardTopic(HtmlCleaner htmlCleaner,
            URL mitbbspageURL, String encoding, String tagName) {
        Log.i(TAG, "parseMITBBSBoardTopic");
        ArrayList<TopicPost> posts = new ArrayList<TopicPost>();
        try {
            TagNode root = fetchDocument(htmlCleaner, mitbbspageURL, encoding);
            List<?> nodes = root.getElementListByName(tagName, true);
            Log.i(TAG, "nodes.length=" + nodes.size());
            for (int i = 0; i < nodes.size(); i++) {
                TagNode postNode = (TagNode) nodes.get(i);
                posts.add(parsePost(postNode.getText().toString(), 0));
            }
        } catch (Exception ex) {
            Log.e(TAG, "parseMITBBSBoardTopic" + ex.toString(), ex);
        }
        return posts;
    }

    /**
     * Downloads a page and returns the root of its cleaned HTML tree. The
     * connection's input stream is always closed before returning.
     */
    private TagNode fetchDocument(HtmlCleaner cleaner, URL pageUrl, String encoding)
            throws java.io.IOException {
        URLConnection conn = pageUrl.openConnection();
        java.io.InputStream in = conn.getInputStream();
        try {
            return cleaner.clean(new InputStreamReader(in, encoding));
        } finally {
            try {
                in.close();
            } catch (java.io.IOException closeError) {
                // The page has been read (or reading already failed); a close
                // failure must not hide that outcome.
                Log.w(TAG, "failed to close page stream", closeError);
            }
        }
    }

    /**
     * Builds one post from the text of a post element. The sender is on line
     * {@code headerLine}, the date (between the last parentheses) is two lines
     * below it, and the message runs from three lines below it up to (not
     * including) the last two lines.
     */
    private TopicPost parsePost(String parsedText, int headerLine) {
        String[] lines = parsedText.split("\n");

        String header = lines[headerLine];
        String user = header.substring(header.indexOf(SENDER_MARKER) + 5, header.indexOf(","));

        String dateLine = lines[headerLine + 2];
        String date = dateLine.substring(dateLine.lastIndexOf("(") + 1,
                dateLine.lastIndexOf(")"));

        StringBuilder message = new StringBuilder();
        for (int k = headerLine + 3; k < lines.length - 2; k++) {
            message.append(TextViewString.RemoveHtmlMarker(lines[k]));
            message.append("\n");
        }

        return new TopicPost(AUTHOR_PREFIX + TextViewString.RemoveHtmlMarker(user),
                TIME_PREFIX + TextViewString.RemoveHtmlMarker(date),
                message.toString());
    }
}