/* LanguageTool, a natural language style checker * Copyright (C) 2013 Daniel Naber (http://www.danielnaber.de) * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 * USA */ package org.languagetool.dev.wikipedia.atom; import java.util.ArrayList; import java.util.Date; import java.util.List; import java.util.Objects; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * One item in the Atom feed, i.e. one or more differences for an article. 
/**
 * A single entry of a Wikipedia Atom feed: its id, title, diff summary and date, plus
 * helpers that pull the removed and the added text out of the diff table in the summary.
 * @since 2.4
 */
class AtomFeedItem {

  /** Captures everything between the first opening and the last closing {@code td} tag of a line. */
  private static final Pattern TABLE_DATA_CONTENT = Pattern.compile("<td.*?>(.*)</td>");
  /** Captures the numeric value of the {@code diff=} parameter of the item id. */
  private static final Pattern DIFF_ID_PATTERN = Pattern.compile("diff=(\\d+)");

  // The patterns below are compiled once instead of on every processed line.
  // Their regex text and their order of application decide the result, so keep both as they are.
  /** Whole-string test: a {@code div} whose content starts with {@code !} or {@code |}. */
  private static final Pattern TABLE_MARKUP_START = Pattern.compile(".*<div.*?>[!\\|].*");
  /** Whole-string test: a {@code !!} separator between two word characters. */
  private static final Pattern TABLE_HEADER_SEPARATOR = Pattern.compile(".*\\w!!\\w.*");
  private static final Pattern TABLE_MARKUP_DIV = Pattern.compile("<div.*?>[!\\|].*?</div>");
  private static final Pattern SPAN_OPEN = Pattern.compile("<span.*?>");
  private static final Pattern DIV_OPEN_WITH_TABLE_CHAR = Pattern.compile("<div.*?>[!\\|]");
  private static final Pattern DIV_OPEN = Pattern.compile("<div.*?>");
  private static final Pattern INS_OPEN = Pattern.compile("<ins.*?>");
  private static final Pattern DEL_OPEN = Pattern.compile("<del.*?>");
  private static final Pattern HTML_COMMENT = Pattern.compile("<!--.*?-->");

  private final String id;
  private final String title;
  private final String summary;
  private final Date date;

  /**
   * @param id the item id; a {@code diff=<number>} part in it is what {@link #getDiffId()} returns
   * @param title the item title
   * @param summary the item summary, processed line by line (split at {@code \n}) by
   *                {@link #getOldContent()} and {@link #getNewContent()}
   * @param date the item date; a copy is stored, so later changes to the argument have no effect
   * @throws NullPointerException if any argument is {@code null}
   */
  AtomFeedItem(String id, String title, String summary, Date date) {
    this.id = Objects.requireNonNull(id, "id");
    this.title = Objects.requireNonNull(title, "title");
    this.summary = Objects.requireNonNull(summary, "summary");
    // java.util.Date is mutable: keep a private copy instead of the caller's instance
    this.date = new Date(Objects.requireNonNull(date, "date").getTime());
  }

  String getId() {
    return id;
  }

  String getTitle() {
    return title;
  }

  String getSummary() {
    return summary;
  }

  /**
   * @return a fresh copy of the item date, so modifying the returned object does not affect this item
   */
  Date getDate() {
    return new Date(date.getTime());
  }

  /**
   * @return the cleaned content of every line that directly follows a "minus" diff marker cell
   */
  List<String> getOldContent() {
    // the marker is U+2212 MINUS SIGN, not the ASCII hyphen-minus ("-"); written as an
    // escape so the value does not depend on the encoding the source file is compiled with
    return getMarkedContent("\u2212");
  }

  /**
   * @return the cleaned content of every line that directly follows a "plus" diff marker cell
   */
  List<String> getNewContent() {
    return getMarkedContent("+");
  }

  /**
   * Walks the summary line by line. A line that (after trimming) starts with the marker cell
   * for {@code plusMinusMarker} announces that the next non-marker line carries the changed
   * text inside a {@code td} element; that text is cleaned and collected.
   *
   * @param plusMinusMarker content of the {@code diff-marker} cell to look for
   * @return the cleaned contents in the order they occur, empty if the marker never occurs
   * @throws RuntimeException if the line after a marker line contains no {@code td} element
   */
  private List<String> getMarkedContent(String plusMinusMarker) {
    String markerCell = "<td class=\"diff-marker\">" + plusMinusMarker + "</td>";
    List<String> result = new ArrayList<>();
    boolean expectingChange = false;
    for (String line : summary.split("\n")) {
      if (line.trim().startsWith(markerCell)) {
        expectingChange = true;
      } else if (expectingChange) {
        Matcher matcher = TABLE_DATA_CONTENT.matcher(line);
        if (!matcher.find()) {
          throw new RuntimeException("Expected change ('" + plusMinusMarker + "') not found in line: " + line);
        }
        result.add(cleanContent(matcher.group(1)));
        expectingChange = false;
      }
    }
    return result;
  }

  /**
   * Strips the {@code span}, {@code div}, {@code ins} and {@code del} tags, table syntax
   * and HTML comments from the content of one diff table cell.
   */
  private static String cleanContent(String cellContent) {
    String text = cellContent;
    if (TABLE_MARKUP_START.matcher(text).matches() && TABLE_HEADER_SEPARATOR.matcher(text).matches()) {
      // remove ugly table syntax like "!Division!!Apps!!Goals!!Apps", triggers the whitespace rules:
      text = remove(TABLE_MARKUP_DIV, text);
    }
    text = remove(SPAN_OPEN, text).replace("</span>", "");
    // remove table syntax: must run before the generic <div ...> removal below
    text = remove(DIV_OPEN_WITH_TABLE_CHAR, text).replace("<div>", "");
    text = remove(DIV_OPEN, text).replace("</div>", "");
    text = remove(INS_OPEN, text).replace("</ins>", "");
    text = remove(DEL_OPEN, text).replace("</del>", "");
    return remove(HTML_COMMENT, text);
  }

  /** Deletes every match of {@code pattern} from {@code text}. */
  private static String remove(Pattern pattern, String text) {
    return pattern.matcher(text).replaceAll("");
  }

  /**
   * Get the diff id from the 'id' element, or {@code 0} if the diff is the creation of a new article.
   */
  public long getDiffId() {
    Matcher matcher = DIFF_ID_PATTERN.matcher(id);
    if (matcher.find()) {
      return Long.parseLong(matcher.group(1));
    }
    // newly created article
    return 0;
  }

  @Override
  public String toString() {
    return "AtomFeedItem{id='" + id + "', title='" + title + "'}";
  }
}