package com.cellasoft.univrapp.utils; import com.cellasoft.univrapp.Settings; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; public class Html { private static final Pattern bodyPattern = Pattern.compile( "<body[^>]*?>([\\s\\S]*?)</body>", Pattern.CASE_INSENSITIVE); private static final Pattern fileExtnPtrn = Pattern .compile("([^\\s]+(\\.(?i)(txt|rtf|doc|docx|docm|odt|ppt|pptx|xlt|xltx|xltm|pps|ppsx|ods|pdf|xls|zip|rar|tar|7z|R))$)"); public static String getBodyContent(String dirtyHtml) { Matcher matcher = bodyPattern.matcher(dirtyHtml); if (matcher.find()) { return matcher.group(1); } return null; } public static String toText(String html) { if (html == null) return html; return html.replaceAll("</?[^>]+>", "").trim(); } public static String decode(String html) { if (html == null) return html; return html.replace("<", "<").replace(">", ">") .replace(""", "\"").replace("'", "'") .replace(" ", " ").replace("&", "&") .replace("—", "").replace("'", "'"); } public static boolean validateFileExtn(String html) { Matcher matcher = fileExtnPtrn.matcher(html); return matcher.matches(); } public static List<String> getAttachment(String html) { List<String> attachment = Lists.newArrayList(); Document doc = Jsoup.parse(html); Elements files = doc.select("dl.docTab > dd > ul.formati > li > a"); for (Element file : files) { if (file != null) { String path = Settings.getUniversity().domain + file.attr("href"); String text = decode(file.text().trim()); String attach = ""; int start = text.lastIndexOf("("); if (start > 0) { String size = text.substring(start, text.length()); attach = "<a href=\"" + path + "\">" + text.replace(size, "") + "</a></div>"; attach += "<br/><small>" + size + "</small></td>"; } else { attach = "<a href=\"" + path + "\">" + text + "</a></div>"; } attach += "</td>"; attachment.add(attach); } } return attachment; } public static String getFileNameToPath(String path) { int start = path.lastIndexOf("/") + 1; return path.substring(start); } public static String parserPage(String html) { String body = getBodyContent(html); Document doc = Jsoup.parse(body); try { Element article = doc.select("div.sezione").first(); return article.html(); } catch (Exception e) { return ""; } finally { doc = null; } } }