package org.softeg.slartus.forpdaapi; import android.net.Uri; import android.text.Html; import android.text.TextUtils; import android.util.Log; import org.softeg.slartus.forpdacommon.DateTimeExternals; import org.softeg.slartus.forpdacommon.NotReportException; import org.softeg.slartus.forpdacommon.UrlExtensions; import org.w3c.dom.Element; import org.w3c.dom.NodeList; import org.xml.sax.InputSource; import java.io.IOException; import java.io.StringReader; import java.util.ArrayList; import java.util.Date; import java.util.regex.Matcher; import java.util.regex.Pattern; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; /* * Created by slinkin on 20.02.14. */ public class NewsApi { public static Boolean like(IHttpClient httpClient, String newsId) throws IOException { String res = httpClient.performGet("http://4pda.ru/wp-content/plugins/karma/ajax.php?p=" + newsId + "&c=0&v=1", false, false); return res != null; } public static Boolean likeComment(IHttpClient httpClient, String newsId, String postId) throws IOException { String res = httpClient.performGet("http://4pda.ru/wp-content/plugins/karma/ajax.php?p=" + newsId + "&c=" + postId + "&v=1", false, false); return res != null; } public static ArrayList<News> getNews(IHttpClient httpClient, String url, ListInfo listInfo) throws Exception { //http://4pda.ru/2013/page/7/ //http://4pda.ru/2013/2/page/7/ //http://4pda.ru/2013/2/2/page/7/ //http://4pda.ru/tag/programs-for-ios/page/3 //http://4pda.ru/page/5/ //http://4pda.ru/page/5/?s=ios - поиск //http://4pda.ru/?s=%EF%EB%E0%ED%F8%E5%F2 //http://4pda.ru/page/6/?s=%EF%EB%E0%ED%F8%E5%F2 final int NEWS_PER_PAGE = 30;// 30 новостей на страницу выводит форум int pageNum = 1; String justUrl = url;// урл без страницы и параметров String params = "";// параметры, например, s=%EF%EB%E0%ED%F8%E5%F2 // сначала проверим на поисковой урл Matcher m = Pattern.compile("(.*?)(?:page/+(\\d+)/+)?\\?(.*?)$", Pattern.CASE_INSENSITIVE) .matcher(url); if (m.find()) { justUrl = m.group(1); if (!TextUtils.isEmpty(m.group(2))) pageNum = Integer.parseInt(m.group(2)); if (!TextUtils.isEmpty(m.group(3))) params = m.group(3); } else { m = Pattern.compile("(.*?)(?:page/+(\\d+)/+)?$", Pattern.CASE_INSENSITIVE) .matcher(url); if (m.find()) { justUrl = m.group(1); if (!TextUtils.isEmpty(m.group(2))) pageNum = Integer.parseInt(m.group(2)); } } pageNum = (int) Math.ceil(listInfo.getFrom() / NEWS_PER_PAGE) + pageNum; String requestUrl = justUrl + "/page/" + pageNum + "/" + params; ArrayList<News> res = new ArrayList<>(); String dailyNewsPage = httpClient.performGet(UrlExtensions.removeDoubleSplitters(requestUrl)); Pattern articlesPattern = Pattern.compile("(<article class=\"post\"[^>]*>[\\s\\S]*?href=\"([^\"]*)\" title[\\s\\S]*?src=\"([^\"]*)\" alt=\"([^\"]*?)\"[\\s\\S]*?<\\/article>)|(<li itemscope[^>]*>[\\s\\S]*?itemprop=\"url\" href=\"([^\"]*?)\"[\\s\\S]*?src=\"([^\"]*?)\" alt=\"([^\"]*?)\"[\\s\\S]*?<\\/div>[^<]*<\\/li>)"); Pattern descriptionPattern = Pattern.compile("(<div itemprop=\"description\">[\\s\\S]*?<p [^>]*>([\\s\\S]*)<\\/p>[^<]*)|(<div itemprop=\"description\">([\\s\\S]*?)<\\/div>)"); Pattern labelPattern = Pattern.compile("<a href=\"([^\"]*)\" class=\"label[^>]*>([\\s\\S]*?)<\\/a>"); Pattern countPattern = Pattern.compile("class=\"v-count\"[^>]*>(\\d*)</a>"); Pattern datePattern = Pattern.compile("<meta itemprop=\"datePublished\" content=\"(\\d+-\\d+-\\d+)[\\s\\S]*?\"\\/>"); Pattern authorPattern = Pattern.compile("(<span class=\"autor\"><a [^>]*>([^<]*)</a>)|(<meta itemprop=\"author\" content=\"([^\"]*)\"/>)"); m = articlesPattern.matcher(dailyNewsPage); Matcher matcher = null; //SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yy"); News news; int group, childGroup; while (m.find()) { news = new News(); group = 0; childGroup = 0; if (m.group(1) == null) group = 4; news.setId(m.group(group + 2).replace("http://4pda.ru", "")); news.setTitle(Html.fromHtml(m.group(group + 4).replaceAll("&","&")).toString()); news.setImgUrl(m.group(group + 3)); if (matcher == null) matcher = descriptionPattern.matcher(m.group(group + 1)); else matcher.usePattern(descriptionPattern).reset(m.group(group + 1)); if (matcher.find()) { if (matcher.group(1) == null) childGroup = 2; news.setDescription(Html.fromHtml(matcher.group(childGroup + 2).replaceAll("<a [^>]*>([^<]*)</a>", "$1")).toString().trim()); } childGroup = 0; matcher.usePattern(labelPattern).reset(m.group(group + 1)); if (matcher.find()) { news.setTagLink(matcher.group(1)); news.setTagTitle(Html.fromHtml(matcher.group(2).trim())); } else { news.setTagTitle(""); } matcher.usePattern(countPattern).reset(m.group(group + 1)); if (matcher.find()) { news.setCommentsCount(Integer.parseInt(matcher.group(1))); } matcher.usePattern(datePattern).reset(m.group(group + 1)); if (matcher.find()) { news.setNewsDate(matcher.group(1)); } matcher.usePattern(authorPattern).reset(m.group(group + 1)); if (matcher.find()) { if (matcher.group(1) == null) childGroup = 2; news.setAuthor(matcher.group(childGroup + 2)); } res.add(news); } if (res.size() == 0 && pageNum == 1 && listInfo.getFrom() == 0) return getNewsFromRss(httpClient, UrlExtensions.removeDoubleSplitters(url + "/feed/")); int lastPageNum = lastPageNum(dailyNewsPage); listInfo.setOutCount(res.size() * lastPageNum); return res; } private static String normalizeRss(String body) { return body.replaceAll("&(?!.{1,4};)", "&"); } public static ArrayList<News> getNewsFromRss(IHttpClient httpClient, String url) throws Exception { ArrayList<News> res = new ArrayList<>(); try { String body = httpClient.performGet(url); if (TextUtils.isEmpty(body)) throw new NotReportException("Сервер вернул пустую страницу!"); DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance(); DocumentBuilder db = dbf.newDocumentBuilder(); body = normalizeRss(body); org.w3c.dom.Document document = db.parse(new InputSource(new StringReader(body))); Element element = document.getDocumentElement(); NodeList nodeList = element.getElementsByTagName("item"); if (nodeList.getLength() > 0) { for (int i = 0; i < nodeList.getLength(); i++) { Element entry = (Element) nodeList.item(i); Element _titleE = (Element) entry.getElementsByTagName("title").item(0); Element _descriptionE = (Element) entry.getElementsByTagName("description").item(0); Element _pubDateE = (Element) entry.getElementsByTagName("pubDate").item(0); Element _linkE = (Element) entry.getElementsByTagName("link").item(0); StringBuilder _title = new StringBuilder(); NodeList nodes = _titleE.getChildNodes(); int nodesLength = nodes.getLength(); for (int c = 0; c < nodesLength; c++) { _title.append(nodes.item(c).getNodeValue()); } //String _description = _descriptionE.getFirstChild().getNodeValue(); StringBuilder _description = new StringBuilder(); nodes = _descriptionE.getChildNodes(); nodesLength = nodes.getLength(); for (int c = 0; c < nodesLength; c++) { _description.append(nodes.item(c).getNodeValue().replace("\n", " ")); } Date _pubDate = new Date(_pubDateE.getFirstChild().getNodeValue()); String _link = _linkE.getFirstChild().getNodeValue(); String author = entry.getElementsByTagName("dc:creator").item(0).getChildNodes().item(0).getNodeValue(); News news = new News(Uri.parse(_link).getPath(), _title.toString()); news.setNewsDate(DateTimeExternals.getDateString(_pubDate)); news.setAuthor(author); news.setDescription(_description.toString().replaceAll("(<img.*?/>)", "")); res.add(news); } } } catch (Throwable ex) { Log.e("NewsApi", ex.toString()); } return res; } private static int lastPageNum(String pagebody) { Matcher m = Pattern.compile("<ul class=\"page-nav\">.*href=\"[\\s\\S]*/+page/+(\\d+)/+\">\\d+.*?</ul>").matcher(pagebody); if (m.find()) { return Integer.parseInt(m.group(1)); } return 1; } }