package com.athena.asm.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.HttpVersion;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpHead;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.entity.mime.MultipartEntity;
import org.apache.http.entity.mime.content.FileBody;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpParams;
import org.apache.http.params.HttpProtocolParams;
import org.apache.http.util.EntityUtils;
import android.util.Log;
import com.athena.asm.data.Attachment;
import com.athena.asm.data.Post;
public class SmthCrawler {
/** Charset name used to decode responses from the www.newsmth.net pages in this class. */
public static String smthEncoding = "GBK";
/** Charset name passed to fetchContent by getUrlContentFromMobile. */
public static String mobileSMTHEncoding = "UTF-8";
/** Value sent as the User-Agent header on every request this class builds. */
public static String userAgent = HttpClientHelper.USER_AGENT;
/** Cookie store captured from the HTTP client by login() and re-installed on the client by fetchContent(). */
public static CookieStore smthCookie;
/** Size of the fixed worker pool; assigned in init(). */
private int threadNum;
/** Worker pool that runs the PostContentCrawler tasks submitted by getPostList(). */
private ExecutorService execService;
/** True after destroy() has run; reset to false by init(). */
private boolean destroy;
/**
 * Lazy holder for the single {@link SmthCrawler} instance: the crawler is
 * constructed when this nested class is first initialised, i.e. on the first
 * read of {@code Holder.instance}.
 */
private static class Holder {
    // final so the shared instance can never be swapped out after creation.
    static final SmthCrawler instance = new SmthCrawler();
}
/** HTTP client created in init() and used for every request issued by this class and its inner task class. */
private DefaultHttpClient httpClient;
/**
 * Returns the shared crawler stored in {@code Holder.instance}.
 * NOTE(review): the method name is misspelled ("Intance"); it is part of the
 * public API, so renaming it would affect callers.
 *
 * @return the shared {@code SmthCrawler}
 */
public static SmthCrawler getIntance() {
return Holder.instance;
}
/** Private constructor; sets up the HTTP client and worker pool by calling init(). */
private SmthCrawler() {
init();
}
/**
 * Returns the static {@code smthCookie} field.
 *
 * @return the stored cookie store, or null if nothing has assigned it yet
 */
public static CookieStore getCookieStore(){
return smthCookie;
}
/**
 * (Re)creates the HTTP client and the worker pool and clears the destroyed
 * flag. Called from the constructor, and may be called again later.
 *
 * If the crawler is still live (destroy() has not been called since the last
 * init()), the previous connection manager and thread pool are shut down
 * first so that a repeated init() does not leave them orphaned.
 */
public void init() {
    // Release resources from an earlier init() that were never destroyed.
    if (!destroy) {
        if (httpClient != null) {
            httpClient.getConnectionManager().shutdown();
        }
        if (execService != null) {
            execService.shutdown();
        }
    }

    threadNum = 10;

    SchemeRegistry schemeRegistry = new SchemeRegistry();
    schemeRegistry.register(new Scheme("http", PlainSocketFactory
            .getSocketFactory(), 80));

    HttpParams params = new BasicHttpParams();
    // Total pooled connections matches the number of worker threads.
    // NOTE(review): no per-route limit is set here, so the library default
    // for connections per host applies - confirm that is intended.
    ConnManagerParams.setMaxTotalConnections(params, threadNum);
    HttpProtocolParams.setVersion(params, HttpVersion.HTTP_1_1);

    ClientConnectionManager cm = new ThreadSafeClientConnManager(params,
            schemeRegistry);
    httpClient = new DefaultHttpClient(cm, params);

    // No retry handler and no connect/socket timeouts are configured;
    // earlier code that set them (3 retries, 10000 ms timeouts) was left
    // commented out, so the library defaults are in effect.
    httpClient.getParams()
            .setParameter(ClientPNames.HANDLE_REDIRECTS, true);

    execService = Executors.newFixedThreadPool(threadNum);
    destroy = false;
}
/**
 * Logs in to www.newsmth.net by POSTing the credentials to bbslogin.php.
 *
 * When the server answers that too many sessions are open, the request is
 * repeated with {@code kick_multi=1}. On success the client's cookie store
 * is saved in {@link #smthCookie}.
 *
 * @param userid account id, sent as form field "id"
 * @param passwd password, sent as form field "passwd"
 * @return 1: success; 0: authentication failed; -1: connection failed
 *         (also returned when the form cannot be encoded or the response
 *         carries no body)
 */
public int login(String userid, String passwd) {
    String url = "http://www.newsmth.net/bbslogin.php";
    HttpPost httpPost = new HttpPost(url);
    List<NameValuePair> formparams = new ArrayList<NameValuePair>();
    formparams.add(new BasicNameValuePair("id", userid));
    formparams.add(new BasicNameValuePair("passwd", passwd));
    UrlEncodedFormEntity entity;
    try {
        entity = new UrlEncodedFormEntity(formparams, "GBK");
    } catch (UnsupportedEncodingException e1) {
        return -1;
    }
    httpPost.setEntity(entity);
    httpPost.setHeader("User-Agent", userAgent);
    try {
        HttpResponse response = httpClient.execute(httpPost);
        HttpEntity body = response.getEntity();
        if (body == null) {
            // Nothing to inspect; treat like a failed connection instead of
            // letting EntityUtils throw an unchecked exception.
            return -1;
        }
        String content = EntityUtils.toString(body, smthEncoding);
        if (content.contains("你登录的窗口过多")) {
            // Too many open sessions: log in again and kick the others.
            formparams.add(new BasicNameValuePair("kick_multi", "1"));
            UrlEncodedFormEntity entity2;
            entity2 = new UrlEncodedFormEntity(formparams, "GBK");
            httpPost = new HttpPost(
                    "http://www.newsmth.net/bbslogin.php?mainurl=");
            httpPost.setHeader("User-Agent", userAgent);
            httpPost.setEntity(entity2);
            HttpResponse kickResponse = httpClient.execute(httpPost);
            // The body is not needed, but it must be consumed so the pooled
            // connection is released back to the connection manager.
            HttpEntity kickBody = kickResponse.getEntity();
            if (kickBody != null) {
                kickBody.consumeContent();
            }
        } else if (content.contains("您的用户名并不存在,或者您的密码错误")) {
            return 0;
        } else if (content.contains("用户密码错误")) {
            return 0;
        }
        // Save the cookies for later requests.
        smthCookie = httpClient.getCookieStore();
    } catch (IOException e) {
        Log.e("com.athena.asm", "login request failed,", e);
        return -1;
    }
    return 1;
}
/**
 * Uploads one file as an attachment through bbsupload.php.
 *
 * The file is sent as multipart field "attachfile0" together with the fixed
 * fields "counter" and "MAX_FILE_SIZE".
 *
 * @param file the file to upload
 * @return true only when the response text contains the server's
 *         upload-success marker; false on any encoding or I/O failure
 */
public boolean uploadAttachFile(File file) {
    MultipartEntity form = new MultipartEntity();
    try {
        form.addPart("attachfile0", new FileBody(file));
        form.addPart("counter", new StringBody("1"));
        form.addPart("MAX_FILE_SIZE", new StringBody("5242880"));
    } catch (UnsupportedEncodingException unsupported) {
        return false;
    }

    HttpPost request = new HttpPost(
            "http://www.newsmth.net/bbsupload.php?act=add");
    request.setEntity(form);
    request.setHeader("User-Agent", userAgent);

    try {
        HttpEntity reply = httpClient.execute(request).getEntity();
        String replyText = EntityUtils.toString(reply, smthEncoding);
        return replyText.contains("上传成功");
    } catch (IOException ioFailure) {
        return false;
    }
}
/**
 * Submits a new post, or an edit of an existing one, to the given URL.
 *
 * @param postUrl     target URL; "&do" is appended when editing
 * @param postTitle   sent as form field "title"
 * @param postContent sent as form field "text"
 * @param signature   sent as form field "signature" (new posts only)
 * @param isEdit      true to edit an existing post, false to create one
 * @return true only when the response text contains the server's success
 *         marker; false on any encoding or I/O failure
 */
public boolean sendPost(String postUrl, String postTitle,
        String postContent, String signature, boolean isEdit) {
    List<NameValuePair> fields = new ArrayList<NameValuePair>();
    fields.add(new BasicNameValuePair("title", postTitle));
    fields.add(new BasicNameValuePair("text", postContent));

    String targetUrl = postUrl;
    if (!isEdit) {
        fields.add(new BasicNameValuePair("signature", signature));
    } else {
        targetUrl = targetUrl + "&do";
    }

    HttpPost request = new HttpPost(targetUrl);
    try {
        request.setEntity(new UrlEncodedFormEntity(fields, "GBK"));
    } catch (UnsupportedEncodingException unsupported) {
        return false;
    }
    request.setHeader("User-Agent", userAgent);

    try {
        HttpEntity reply = httpClient.execute(request).getEntity();
        String replyText = EntityUtils.toString(reply, smthEncoding);
        return replyText.contains("成功");
    } catch (IOException ioFailure) {
        return false;
    }
}
/**
 * Sends a mail by POSTing a form to the given URL.
 *
 * The arguments are sent under the form field names "title", "userid",
 * "num", "dir", "file", "signature" and "text"; a fixed "backup=1" field is
 * added as well.
 *
 * @param mailUrl     target URL of the mail form
 * @param mailTitle   mail subject
 * @param userid      value of the "userid" field
 * @param num         value of the "num" field
 * @param dir         value of the "dir" field
 * @param file        value of the "file" field
 * @param signature   value of the "signature" field
 * @param mailContent mail body
 * @return true only when the response text contains the server's
 *         sent-successfully marker; false on any encoding or I/O failure
 */
public boolean sendMail(String mailUrl, String mailTitle, String userid,
        String num, String dir, String file, String signature,
        String mailContent) {
    HttpPost request = new HttpPost(mailUrl);

    // Field order is kept exactly as the form expects it.
    String[][] pairs = {
            { "title", mailTitle },
            { "userid", userid },
            { "num", num },
            { "dir", dir },
            { "file", file },
            { "signature", signature },
            { "backup", "1" },
            { "text", mailContent } };
    List<NameValuePair> fields = new ArrayList<NameValuePair>();
    for (String[] pair : pairs) {
        fields.add(new BasicNameValuePair(pair[0], pair[1]));
    }

    try {
        request.setEntity(new UrlEncodedFormEntity(fields, "GBK"));
    } catch (UnsupportedEncodingException unsupported) {
        return false;
    }
    request.setHeader("User-Agent", userAgent);

    try {
        HttpEntity reply = httpClient.execute(request).getEntity();
        String replyText = EntityUtils.toString(reply, smthEncoding);
        return replyText.contains("发送成功");
    } catch (IOException ioFailure) {
        return false;
    }
}
/*
* public String getRedirectUrl(String url) {
* httpClient.getParams().setParameter(ClientPNames.HANDLE_REDIRECTS,false);
* HttpGet httpget = new HttpGet(url); httpget.setHeader("User-Agent",
* userAgent); httpget.addHeader("Accept-Encoding", "gzip, deflate"); String
* newUrl; try { HttpResponse response = httpClient.execute(httpget); Header
* locationHeader = response.getLastHeader("Location"); newUrl =
* locationHeader.getValue(); } catch (IOException e) {
* Log.d("com.athena.asm", "get url failed,", e); newUrl = null; }
* httpClient.getParams().setParameter(ClientPNames.HANDLE_REDIRECTS,true);
* return newUrl; }
*/
/**
 * POSTs the given form parameters (GBK url-encoded) and returns the response
 * body decoded with {@link #smthEncoding}.
 *
 * @param url    target URL
 * @param params form fields to send
 * @return the response text, or null when the form cannot be encoded or the
 *         request fails with an I/O error
 */
public String getPostRequestResult(String url, List<NameValuePair> params) {
    HttpPost request = new HttpPost(url);
    try {
        request.setEntity(new UrlEncodedFormEntity(params, "GBK"));
    } catch (UnsupportedEncodingException unsupported) {
        return null;
    }
    request.setHeader("User-Agent", userAgent);

    try {
        HttpResponse reply = httpClient.execute(request);
        return EntityUtils.toString(reply.getEntity(), smthEncoding);
    } catch (IOException ioFailure) {
        return null;
    }
}
/**
 * GETs a URL and returns the response body as text.
 *
 * If {@link #smthCookie} is set it is installed on the client before the
 * request. A gzip-encoded response is decompressed and read line by line,
 * with every line terminated by "\n"; any other response is decoded as-is.
 *
 * @param url      address to fetch
 * @param encoding charset name used to decode the body
 * @return the decoded body, or null when the request or decoding fails
 */
public String fetchContent(String url, String encoding) {
    HttpGet httpget = new HttpGet(url);
    httpget.setHeader("User-Agent", userAgent);
    // Only gzip is decoded below, so it is the only encoding advertised;
    // a "deflate" response would otherwise be returned as garbage.
    httpget.addHeader("Accept-Encoding", "gzip");
    if (smthCookie != null) {
        httpClient.setCookieStore(smthCookie);
    }
    String content;
    try {
        HttpResponse response = httpClient.execute(httpget);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            Log.e("Crawler:fetchContent", "get url failed, empty response");
            return null;
        }

        boolean isgzip = false;
        Header[] headers = response.getHeaders("Content-Encoding");
        if (headers != null) {
            for (Header header : headers) {
                String value = header.getValue();
                if (value != null && value.contains("gzip")) {
                    isgzip = true;
                    break;
                }
            }
        }

        if (isgzip) {
            InputStream raw = entity.getContent();
            BufferedReader br = null;
            try {
                br = new BufferedReader(new InputStreamReader(
                        new GZIPInputStream(raw), encoding));
                StringBuilder sb = new StringBuilder();
                String line;
                while ((line = br.readLine()) != null) {
                    sb.append(line);
                    sb.append("\n");
                }
                content = sb.toString();
            } finally {
                // Always close so the pooled connection is released, even
                // when the gzip header is bad or reading fails midway.
                if (br != null) {
                    br.close();
                } else {
                    raw.close();
                }
            }
        } else {
            content = EntityUtils.toString(entity, encoding);
        }
    } catch (Exception e) {
        Log.e("Crawler:fetchContent", "get url failed,", e);
        content = null;
    }
    return content;
}
/**
 * Issues a HEAD request and extracts the attachment file name from the
 * Content-Disposition header.
 *
 * The name is the text after the last '=' of the first header value that
 * mentions "filename", with surrounding whitespace and one pair of enclosing
 * double quotes removed. No charset or RFC 5987 ("filename*=") decoding is
 * performed.
 *
 * @param url address of the attachment
 * @return the extracted name, or "file.unknown" when the request fails, no
 *         such header is present, or the extracted name is empty
 */
public String fetchAttachmentFilename(String url) {
    HttpHead httphead = new HttpHead(url);
    httphead.setHeader("User-Agent", userAgent);
    String filename = "file.unknown";
    try {
        HttpResponse response = httpClient.execute(httphead);
        Header[] headers = response.getHeaders("Content-Disposition");
        if (headers != null) {
            for (Header header : headers) {
                String value = header.getValue();
                if (value == null || !value.contains("filename")) {
                    continue;
                }
                // how to decode filename from http header:
                // http://stackoverflow.com/questions/93551/how-to-encode-the-filename-parameter-of-content-disposition-header-in-http
                String rawValue = value.substring(
                        value.lastIndexOf("=") + 1).trim();
                // The parameter may be a quoted-string; drop the quotes so
                // they do not end up in the file name.
                if (rawValue.length() >= 2 && rawValue.startsWith("\"")
                        && rawValue.endsWith("\"")) {
                    rawValue = rawValue.substring(1, rawValue.length() - 1);
                }
                if (rawValue.length() != 0) {
                    filename = rawValue;
                }
                return filename;
            }
        }
    } catch (Exception e) {
        Log.e("Crawler:fetchFilename", "head url failed,", e);
    }
    return filename;
}
/**
 * Fetches a URL via fetchContent, decoding the body with {@code mobileSMTHEncoding}.
 *
 * @param url address to fetch
 * @return whatever fetchContent returns for this URL (null on failure)
 */
public String getUrlContentFromMobile(String url) {
return fetchContent(url, mobileSMTHEncoding);
}
/**
 * Fetches a URL via fetchContent, decoding the body with {@code smthEncoding}.
 *
 * @param url address to fetch
 * @return whatever fetchContent returns for this URL (null on failure)
 */
public String getUrlContent(String url) {
return fetchContent(url, smthEncoding);
}
/** Matches the body argument of the page's {@code prints('...');} call. */
private static final Pattern POST_CONTENT_PATTERN = Pattern.compile(
        "prints\\('(.*?)'\\);", Pattern.DOTALL);

/** Matches the page's {@code conWriter(...)} call and captures its numeric ids and trailing quoted string. */
private static final Pattern POST_INFO_PATTERN = Pattern
        .compile("conWriter\\(\\d+, '[^']+', \\d+, (\\d+), (\\d+), (\\d+), '[^']+', (\\d+), \\d+,'([^']+)'\\);");

/**
 * Fills in every post of the list by running one {@code PostContentCrawler}
 * per post on the worker pool and waiting for all of them to finish.
 *
 * A task that fails is logged and does not stop the others. If the calling
 * thread is interrupted while waiting, the interrupt flag is restored and
 * the method returns without waiting for the remaining tasks (they keep
 * running on the pool).
 *
 * @param postList posts to populate in place; null is ignored
 */
public void getPostList(List<Post> postList) {
    if (postList == null) {
        return;
    }
    List<Future<?>> futureList = new ArrayList<Future<?>>(postList.size());
    for (Post post : postList) {
        futureList.add(execService.submit(new PostContentCrawler(post,
                POST_CONTENT_PATTERN, POST_INFO_PATTERN)));
    }
    for (Future<?> future : futureList) {
        try {
            future.get();
        } catch (InterruptedException e) {
            Log.e("com.athena.asm", "excute error", e);
            // Preserve the interrupt for the caller and stop blocking.
            Thread.currentThread().interrupt();
            return;
        } catch (ExecutionException e) {
            Log.e("com.athena.asm", "excute error", e);
        }
    }
}
/**
 * Empty placeholder; the body does nothing.
 * NOTE(review): the name is misspelled ("excute") and the method appears
 * unused within this file - confirm whether it can be removed.
 */
public void excuteMethod() {
}
/**
 * Shuts down the HTTP client's connection manager and the worker pool, then
 * marks this crawler as destroyed. init() creates fresh ones and clears the flag.
 */
public void destroy() {
httpClient.getConnectionManager().shutdown();
execService.shutdown();
destroy = true;
}
/**
 * Reports whether destroy() has been called since the last init().
 *
 * @return the current value of the destroy flag
 */
public boolean isDestroy() {
return destroy;
}
/**
 * Task that downloads the bbscon.php page of one post with the enclosing
 * crawler's HTTP client and copies what it finds (content, date, ids, title,
 * attachments) into that {@link Post}.
 */
class PostContentCrawler implements Runnable {
    private final Post post;
    private final Pattern contentPattern;
    private final Pattern infoPattern;

    /**
     * @param post           post to populate; its board id and subject id
     *                       select the page to download
     * @param contentPattern pattern whose group 1 is the raw post text
     * @param infoPattern    pattern whose groups 1, 2 and 5 are used as
     *                       subject id, topic subject id and title
     */
    public PostContentCrawler(Post post, Pattern contentPattern,
            Pattern infoPattern) {
        this.post = post;
        this.contentPattern = contentPattern;
        this.infoPattern = infoPattern;
    }

    /**
     * Downloads the page and fills the post. Returns without touching the
     * post when the download fails or yields no body.
     */
    @Override
    public void run() {
        String url = "http://www.newsmth.net/bbscon.php?bid="
                + post.getBoardID() + "&id=" + post.getSubjectID();
        String content;
        try {
            content = downloadPage(url);
        } catch (IOException e) {
            Log.d("com.athena.asm", "get url failed,", e);
            return;
        }
        // Checked before the first matcher is created; previously this test
        // came after content had already been dereferenced.
        if (content == null) {
            return;
        }

        Matcher contentMatcher = contentPattern.matcher(content);
        if (contentMatcher.find()) {
            String contentString = contentMatcher.group(1);
            Object[] objects = StringUtility
                    .parsePostContent(contentString);
            post.setContent((String) objects[0]);
            post.setDate((java.util.Date) objects[1]);
        }

        Matcher infoMatcher = infoPattern.matcher(content);
        if (infoMatcher.find()) {
            post.setSubjectID(infoMatcher.group(1));
            post.setTopicSubjectID(infoMatcher.group(2));
            post.setTitle(infoMatcher.group(5));
        }

        post.setAttachFiles(parseAttachments(content));
    }

    /**
     * GETs the page and decodes it with {@code SmthCrawler.smthEncoding},
     * decompressing gzip responses (each line then ends with "\n").
     *
     * @return the page text, or null when the response has no body
     */
    private String downloadPage(String url) throws IOException {
        HttpGet httpget = new HttpGet(url);
        httpget.setHeader("User-Agent", SmthCrawler.userAgent);
        // Only gzip is decoded below, so it is the only encoding advertised.
        httpget.addHeader("Accept-Encoding", "gzip");

        HttpResponse response = httpClient.execute(httpget);
        HttpEntity entity = response.getEntity();
        if (entity == null) {
            return null;
        }

        boolean isgzip = false;
        Header[] headers = response.getHeaders("Content-Encoding");
        if (headers != null) {
            for (Header header : headers) {
                String value = header.getValue();
                if (value != null && value.contains("gzip")) {
                    isgzip = true;
                    break;
                }
            }
        }
        if (!isgzip) {
            return EntityUtils.toString(entity, SmthCrawler.smthEncoding);
        }

        InputStream raw = entity.getContent();
        BufferedReader br = null;
        try {
            br = new BufferedReader(new InputStreamReader(
                    new GZIPInputStream(raw), SmthCrawler.smthEncoding));
            StringBuilder sb = new StringBuilder();
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
                sb.append("\n");
            }
            return sb.toString();
        } finally {
            // Always close so the pooled connection is released, even when
            // the gzip header is bad or reading fails midway.
            if (br != null) {
                br.close();
            } else {
                raw.close();
            }
        }
    }

    /**
     * Builds one {@link Attachment} per {@code attach('name', len, pos)}
     * call in the page. The five numbers of the first {@code attWriter(...)}
     * call, when present, are copied onto every attachment; otherwise those
     * fields are set to null.
     */
    private ArrayList<Attachment> parseAttachments(String content) {
        String bid = null, id = null, ftype = null, num = null, cacheable = null;
        Matcher attachPartOneMatcher = Pattern.compile(
                "attWriter\\((\\d+),(\\d+),(\\d+),(\\d+),(\\d+)").matcher(
                content);
        if (attachPartOneMatcher.find()) {
            bid = attachPartOneMatcher.group(1);
            id = attachPartOneMatcher.group(2);
            ftype = attachPartOneMatcher.group(3);
            num = attachPartOneMatcher.group(4);
            cacheable = attachPartOneMatcher.group(5);
        }

        ArrayList<Attachment> attachFiles = new ArrayList<Attachment>();
        Matcher attachPartTwoMatcher = Pattern.compile(
                "attach\\('([^']+)', (\\d+), (\\d+)\\)").matcher(content);
        while (attachPartTwoMatcher.find()) {
            Attachment innerAtt = new Attachment();
            innerAtt.setBid(bid);
            innerAtt.setId(id);
            innerAtt.setFtype(ftype);
            innerAtt.setNum(num);
            innerAtt.setCacheable(cacheable);
            innerAtt.setMobileType(false);
            innerAtt.setName(attachPartTwoMatcher.group(1));
            innerAtt.setLen(attachPartTwoMatcher.group(2));
            innerAtt.setPos(attachPartTwoMatcher.group(3));
            attachFiles.add(innerAtt);
        }
        return attachFiles;
    }
}
}