import com.github.abola.crawler.CrawlerPack;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.ArrayList;
import java.util.List;
/**
 * Example: use CrawlerPack to fetch the URLs of the most recent 50 posts
 * on PTT's Gossiping board.
 */
class PttGossiping {
    /** Entry page of the Gossiping board (always the newest index page). */
    final static String gossipMainPage = "https://www.ptt.cc/bbs/Gossiping/index.html";
    /** Template for a numbered index page; %s is substituted with the page number. */
    final static String gossipIndexPage = "https://www.ptt.cc/bbs/Gossiping/index%s.html";

    // How many of the most recent post URLs to collect.
    // (Kept as a non-final Integer field to preserve the original interface.)
    static Integer loadLastPosts = 50;

    public static void main(String[] argv) {
        // The newest index page has no number in its URL; the "previous page"
        // link in the action bar reveals the current numbering (prev = last - 1).
        String prevPage =
            CrawlerPack.start()
                .addCookie("over18", "1")              // the board requires the over-18 cookie
                .getFromHtml(gossipMainPage)           // remote content is HTML
                .select(".action-bar .pull-right > a") // top-right navigation links
                .get(1).attr("href")
                // BUGFIX: escape the '.' — unescaped it matches any character,
                // so e.g. "indexXhtml" segments would also be rewritten.
                .replaceAll("/bbs/Gossiping/index([0-9]+)\\.html", "$1");

        // Index number of the newest (last) page; plain int avoids boxing.
        int lastPage = Integer.valueOf(prevPage) + 1;

        List<String> lastPostsLink = new ArrayList<String>();
        // Walk backwards through the index pages until enough links are collected.
        // BUGFIX: also stop at page 1 — without the bound, a board with fewer
        // posts than requested made lastPage go negative and the loop fetch
        // invalid URLs forever.
        while (loadLastPosts > lastPostsLink.size() && lastPage >= 1) {
            String currPage = String.format(gossipIndexPage, lastPage--);
            Elements links =
                CrawlerPack.start()
                    .addCookie("over18", "1")
                    .getFromHtml(currPage)
                    .select(".title > a"); // each post title anchor links to the article
            for (Element link : links) lastPostsLink.add(link.attr("href"));
        }

        // BUGFIX: the final page fetch can push the total past the requested
        // count; trim so at most loadLastPosts URLs are printed, keeping the
        // links collected first (those from the newest pages).
        if (lastPostsLink.size() > loadLastPosts) {
            lastPostsLink = lastPostsLink.subList(0, loadLastPosts);
        }

        // Print the collected article URLs, one per line.
        for (String url : lastPostsLink) {
            System.out.println(url);
        }
    }
}