package com.aiyou.news.utils; import java.net.URLEncoder; import java.util.ArrayList; import java.util.regex.MatchResult; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.aiyou.utils.AiYouManager; public class NewsManager { // 信息门户地址 public static final String URL_INFO_HEAD = "http://portal.bupt.edu.cn"; // 信息门户通告 public static final String URL_INFO_INFORM = "http://portal.bupt.edu.cn/sites/main/column_by_upper.jsp?ColumnID=38&page="; // 信息门户新闻 public static final String URL_INFO_NEWS = "http://portal.bupt.edu.cn/sites/main/column_by_upper.jsp?ColumnID=37&page="; // 北邮要闻地址 public static final String URL_HEADLINE_HEAD = "http://www.bupt.edu.cn"; // 北邮要闻 public static final String URL_HEADLINE = "http://www.bupt.edu.cn/list/list.php?p=81_15_"; // 北邮要闻缩放比例 public static final int mHeadlineScaleSize = 175; // 信息门户缩放比例 public static final int mNewsScaleSize = 150; /** * 匹配网页内容—信息息门户 * * @param htmlSource 获取的html源码 * @return 封装好的新闻标题列表 */ public static News getNewsTitle(String htmlSource) { ArrayList<News> result = new ArrayList<>(); News news; Pattern p = Pattern .compile("href=\"(.*?)\" target=\"_blank\" title=\"(.*?)\""); Matcher m = p.matcher(htmlSource); while (m.find()) { MatchResult mr = m.toMatchResult(); news = new News(); news.title = mr.group(2); String url = mr.group(1); if (!url.contains(URL_INFO_HEAD)) { news.url = URL_INFO_HEAD + url; } else { news.url = url; } result.add(news); } int length = result.size(); int index = 0; p = Pattern.compile("<dd>(\\d{4}\\-\\d{2}\\-\\d{2})</dd>"); m = p.matcher(htmlSource); while (m.find()) { MatchResult mr = m.toMatchResult(); news = result.get(index); news.date = mr.group(1); index++; if (index >= length - 1) { break; } } index = 0; p = Pattern .compile("<dt style=\"width:110px;overflow: hidden;text-overflow: ellipsis;white-space: nowrap;\">(.*?)</dt>"); m = p.matcher(htmlSource); while (m.find()) { MatchResult mr = m.toMatchResult(); news = result.get(index); String from = mr.group(1); if (AiYouManager.getTxtWithoutNTSRElement(from, "") != null) { news.from = from; } index++; if (index > length - 1) { break; } } news = new News(); news.list = result; return news; } /** * 获取信息门户新闻、通告内容 * * @param htmlSource * @return */ public static String getNewsContent(String htmlSource) { String result = htmlSource; Pattern p = Pattern.compile("data_ue_src=\".*?\""); Matcher m = p.matcher(result); while (m.find()) { result = result.replace(m.group(), ""); } p = Pattern .compile("<span objparam=\"fieldname:Content\" tag=\"_ddfield\" objid=\"6044\" >([\\s\\S]*)</span>"); m = p.matcher(result); while (m.find()) { MatchResult mr = m.toMatchResult(); result = mr.group(1); } // 去掉前面的空白字符 p = Pattern.compile("^[\\s]|[\t]|[\r]|[\n]|[?]"); m = p.matcher(result); result = m.replaceAll(""); // 修正图片、连接地址 result = completeHerf(result, URL_INFO_HEAD); result = completeImgSrc(result, URL_INFO_HEAD); return result; } /** * 匹配网页内容—北邮要闻 * * @param htmlSource 获取的html源码 * @return 封装好的新闻标题列表 */ public static News getHeadlineTitle(String htmlSource) { ArrayList<News> result = new ArrayList<>(); News news; // 获取新闻主体 String[] arr1 = htmlSource.split("<ul class=\"ovhi\">"); if (arr1.length >= 2) { htmlSource = arr1[1]; } else { return null; } String[] arr2 = htmlSource.split("</ul>"); htmlSource = arr2[0]; // 获取连接和标题 Pattern p = Pattern .compile("href=\"(.*?)\" title=\"(.*?)\"><[^>]*?>(.*?)</font>"); Matcher m = p.matcher(htmlSource); while (m.find()) { MatchResult mr = m.toMatchResult(); news = new News(); String url = mr.group(1); if (!url.contains(URL_HEADLINE_HEAD)) { news.url = URL_HEADLINE_HEAD + url; } else { news.url = url; } // strTitle = getStringFromSign(mr.group(2)); news.title = mr.group(2); // String[] arr = strTitle.split(" "); news.date = mr.group(3); result.add(news); } news = new News(); news.list = result; return news; } /** * 获取北邮要闻新闻内容 * * @param htmlSource * @return */ public static String getHeadlineContent(String htmlSource) { String result = htmlSource; Pattern p = Pattern.compile("data_ue_src=\".*?\""); Matcher m = p.matcher(result); while (m.find()) { result = result.replace(m.group(), ""); } p = Pattern.compile("<div class=\"content detail\">([\\s\\S]*?)</div>"); m = p.matcher(result); while (m.find()) { MatchResult mr = m.toMatchResult(); result = mr.group(1); } // 去掉前面的空白字符 p = Pattern.compile("^[\\s]|[\t]|[\r]|[\n]|[?]"); m = p.matcher(result); result = m.replaceAll(""); // 修正图片、连接地址 result = completeHerf(result, URL_HEADLINE_HEAD); result = completeImgSrc(result, URL_HEADLINE_HEAD); return result; } /** * 将路径中的中文名进行编码,以获取正确的路径 * * @param path * @return */ @SuppressWarnings("deprecation") public static String imgPathEncoder(String path) { String convertPath = ""; String arr[] = path.split("/"); int length = arr.length; for (int i = 0; i < length - 1; i++) { convertPath += arr[i] + "/"; } convertPath += URLEncoder.encode(arr[length - 1]); return convertPath; } /** * 将图片的地址补全 * * @param html 图片的相对地址 * @param urlPath 要补充的前缀 * @return 返回绝对地址 */ private static String completeImgSrc(String html, String urlPath) { String str = html; String strarr[] = str.split("src=\""); str = strarr[0]; for (int i = 1; i < strarr.length; i++) {// 将图片的路径补充完全 if ("http".equals(strarr[i].substring(0, 4))) { str += "src=\"" + strarr[i]; } else { str += "src=\"" + urlPath + strarr[i]; } } return str; } /** * 将链接的地址补全 * * @param html 相对地址 * @param urlPath 要添加的前缀 * @return 绝对地址 */ private static String completeHerf(String html, String urlPath) { String str = html; String strarr[] = str.split("href=\""); str = strarr[0]; for (int i = 1; i < strarr.length; i++) {// 将链接的路径补充完全 if ("http".equals(strarr[i].substring(0, 4))) { str += "href=\"" + strarr[i]; } else { str += "href=\"" + urlPath + strarr[i]; } } return str; } }