package org.softeg.slartus.forpdaapi; import android.text.Html; import android.text.TextUtils; import org.softeg.slartus.forpdacommon.Functions; import java.io.IOException; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * Created by slinkin on 20.02.14. */ public class NewsList extends ArrayList<News> { private IHttpClient mClient; private CharSequence mSearchTag; private CharSequence mLastNewsUrl; private int mLastNewsPage; private int newsCountInt; /** * Сколько всего новостей на сайте * * @return */ public int getNewsCount() { return newsCountInt; } /** * Возвращает из url тэг * * @param url * @return news, articles, software и тд. или пусто для "Все" */ private static String getSearchTag(String url) { Matcher m = Pattern.compile("4pda.ru/tag/(.*?)(/|$)").matcher(url); if (m.find()) { return "tag/" + m.group(1) + "/"; } return url; } /** * @param client * @param newsUrl - урл страницы новостей */ public NewsList(IHttpClient client, String newsUrl) { mClient = client; mSearchTag = getSearchTag(newsUrl); } public void loadNextNewsPage() throws IOException, ParseException { if (size() == 0) { getPage(1, "http://4pda.ru/" + mSearchTag); return; } mLastNewsUrl = size() > 0 ? get(size() - 1).getId() : ""; mLastNewsPage = size() > 0 ? get(size() - 1).getPage() : 0; CharSequence url = mLastNewsUrl; if (TextUtils.isEmpty(mSearchTag)) { Matcher m = Pattern.compile("4pda.ru/(\\d+)/(\\d+)/(\\d+)/(\\d+)").matcher(url); m.find(); int year = Integer.parseInt(m.group(1)); int nextPage = mLastNewsPage + 1; loadPage(year, nextPage, 0); } else { int nextPage = mLastNewsPage + 1; getPage(nextPage, "http://4pda.ru/" + mSearchTag + "page/" + nextPage); } } private void loadPage(int year, int nextPage, int iteration) throws IOException, ParseException { String dailyNewsUrl = "http://4pda.ru/" + year + "/page/" + nextPage; String dailyNewsPage = getPage(nextPage, dailyNewsUrl); if (size() == 0) { if (iteration > 0) return; if (dailyNewsPage.contains("По указанным параметрам не найдено ни одного поста")) loadPage(year - 1, 1, iteration + 1); else loadPage(year, nextPage + 1, iteration + 1); } } private int lastPageNum(String pagebody, int curPage) { Matcher m = Pattern.compile("<div class=\"wp-pagenavi\">.*<a href=\".*?/page/(\\d+)/\"\\s+class=\"page\".*?</div>").matcher(pagebody); if (m.find()) { int newsPerPage = size() / curPage; return Integer.parseInt(m.group(1)) * newsPerPage; } return getNewsCount(); } private String getPage(int page, String newsUrl) throws IOException, ParseException { String dailyNewsPage = mClient.performGet(newsUrl); Matcher postsMatcher = Pattern.compile("<div class=\"post\" id=\"post-\\d+\">([\\s\\S]*?)<span id=\"ka_\\d+_0_n\"></span></div><br /></div></div>") .matcher(dailyNewsPage); Boolean someUnloaded = false;// одна из новостей незагружена - значит и остальные SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yy"); Pattern mPattern = Pattern.compile("<a href=\"(/\\d+/\\d+/\\d+/(\\d+))/\" rel=\"bookmark\" title=\"(.*?)\" alt=\"\">.*?</a></h2>"); Pattern textPattern = Pattern.compile("<div class=\"entry\" id=\"[^\"]*\">" + "(?:<a href=\"([^\"]*)\" class=\"oprj ([^\"]*)\"><div>(.*?)</div>)?" + "([\\s\\S]*?)" + "(?:<noindex><span class=\"mb_source\">Источник: <a href=\"([^\"]*)\" target=\"[^\"]*\">(.*?)</a></span></noindex>)?" + "<br /><br /></div><div class=\"postmetadata\" id=\"ka_meta_\\d+_0\"><span id=\"ka_\\d+_0\"></span> \\| " + "<strong>(.*?)</strong> \\|\\s*(\\d+\\.\\d+\\.\\d+)\\s*\\|\\s*" + "<a href=\"/\\d+/\\d+/\\d+/\\d+/#comments\" title=\"[^\"]*\"><b class=\"spr pc\"></b>\\s*(\\d+)\\s*</a>"); Pattern imagePattern = Pattern.compile("<center><img[^>]*?src=\"(.*?)\""); while (postsMatcher.find()) { String postData = postsMatcher.group(1); Matcher m = mPattern.matcher(postData); if (m.find()) { String id = "http://4pda.ru" + m.group(1); if (!someUnloaded && findByTitle(id) != null) continue; someUnloaded = true; News news = new News(id, Html.fromHtml(m.group(3)).toString()); Matcher textMatcher = textPattern.matcher(postData); if (textMatcher.find()) { if (textMatcher.group(1) != null) { news.setTagLink(textMatcher.group(1)); news.setTagName(textMatcher.group(2)); news.setTagTitle(textMatcher.group(3)); } if (textMatcher.group(5) != null) { news.setSourceUrl(textMatcher.group(5)); news.setSourceTitle(textMatcher.group(6)); } news.setDescription(Html.fromHtml(removeDescriptionTrash(textMatcher.group(4))).toString()); news.setAuthor(Html.fromHtml(textMatcher.group(7))); Date _pubDate = dateFormat.parse(textMatcher.group(8)); news.setNewsDate(Functions.getForumDateTime(_pubDate)); news.setCommentsCount(Integer.parseInt(textMatcher.group(9))); } Matcher imageMatcher = imagePattern.matcher(postData); if (imageMatcher.find()) { news.setImgUrl(imageMatcher.group(1)); } news.setPage(page); add(news); } } newsCountInt = Math.max(getNewsCount(), lastPageNum(dailyNewsPage, page)); return dailyNewsPage; } /** * Удалить из краткого текста новости ссылки "читать дальше" и картинки * * @return */ private static String removeDescriptionTrash(CharSequence description) { return Pattern .compile("<p style=\"[^\"]*\"><a href=\"/\\d+/\\d+/\\d+/\\d+/#more-\\d+\" class=\"more-link\">читать дальше</a></p>|<img[^>]*?/>") .matcher(description) .replaceAll("").trim(); } public News findByTitle(String title) { title = title.toLowerCase().replace(" ", ""); for (int i = 0; i < size(); i++) { News topic = get(i); if (topic.getTitle().toString().replace(" ", "").equalsIgnoreCase(title)) return topic; } return null; } }