/*
 * Copyright (C) 2011 4th Line GmbH, Switzerland
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as
 * published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

package org.fourthline.lemma.reader.content.filter;

import org.fourthline.lemma.Constants;
import org.seamless.xhtml.Option;
import org.fourthline.lemma.anchor.CitationAnchor;

import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Removes fragment labels, whitespaces, etc. from citation source text.
 *
 * @author Christian Bauer
 */
public class CleanupFilter implements ContentFilter {

    private static final Logger log = Logger.getLogger(CleanupFilter.class.getName());

    /** CLEAN_LABELS option value that limits label removal to the first and last source line. */
    private static final String CLEAN_LABELS_BOUNDARY = "boundary";

    /** Captures the leading whitespace run of a line as group 1. */
    private static final Pattern LEADING_WHITESPACE = Pattern.compile("^(\\s+).*");

    /** Matches a line that is empty or consists only of whitespace. */
    private static final Pattern BLANK_LINE = Pattern.compile("\\s*");

    final private Pattern fragmentLabelPattern;

    /**
     * @param fragmentLabelPattern pattern used by {@link #removeFragmentComment(String)}; that
     *                             method reads group 1 as the text to keep and group 2 as the
     *                             label compared against {@link Constants#CALLOUT_LABEL}.
     */
    public CleanupFilter(Pattern fragmentLabelPattern) {
        this.fragmentLabelPattern = fragmentLabelPattern;
    }

    /**
     * @return the pattern this filter was constructed with.
     */
    public Pattern getFragmentLabelPattern() {
        return fragmentLabelPattern;
    }

    /**
     * Cleans the given source lines in three steps, each controlled by a citation option:
     * fragment label removal (CLEAN_LABELS), removal of the first line's indentation from
     * every line (LTRIM), and collapsing of consecutive blank lines (LINE_TRIM). A step is
     * enabled when its option is absent or true; CLEAN_LABELS additionally accepts the value
     * "boundary", which restricts label removal to the first and last line.
     *
     * @param source   the lines to clean; returned unchanged if <code>null</code> or empty.
     * @param citation the citation whose options control the cleanup.
     * @return the cleaned lines, possibly fewer than were passed in.
     */
    public String[] filter(String[] source, CitationAnchor citation) {
        if (source == null || source.length == 0)
            return source;

        log.fine("Cleaning (removing labels, whitespace, escaping) source lines: " + source.length);

        // The options do not depend on the line, so they are evaluated once up front
        Option cleanLabelsOption = citation.getOption(CitationAnchor.OptionKey.CLEAN_LABELS);
        boolean cleanAllLabels = cleanLabelsOption == null || cleanLabelsOption.isTrue();
        // equalsIgnoreCase is locale-independent, unlike toLowerCase() with the default locale
        boolean cleanBoundaryLabels = cleanLabelsOption != null
                && CLEAN_LABELS_BOUNDARY.equalsIgnoreCase(cleanLabelsOption.getFirstValue());

        Option ltrimOption = citation.getOption(CitationAnchor.OptionKey.LTRIM);
        boolean ltrim = ltrimOption == null || ltrimOption.isTrue();

        // The indentation of the first line is what gets removed from all lines
        int spaces = countLeadingWhitespace(source[0]);

        // Stripping zero characters is a no-op, so no pattern is needed in that case
        Pattern indentPattern = ltrim && spaces > 0
                ? Pattern.compile("^\\s{" + spaces + ",}.*")
                : null;

        List<String> cleanLines = new ArrayList<String>(source.length);
        for (int i = 0; i < source.length; i++) {
            String line = source[i];

            // Remove fragment label from line, or remove the whole line
            if (cleanAllLabels) {
                line = removeFragmentComment(line);
                if (line == null) continue;
            }

            // Same, but only on the first or last line (boundary of fragment)
            if (cleanBoundaryLabels && (i == 0 || i == source.length - 1)) {
                line = removeFragmentComment(line);
                if (line == null) continue;
            }

            // Remove the indentation, if the line has at least that much leading whitespace
            // followed by at least one more character
            if (indentPattern != null
                    && line.length() > spaces
                    && indentPattern.matcher(line).matches()) {
                line = line.substring(spaces);
            }

            // Escape XHTML reserved characters
            // TODO line = XHTMLParser.escape(line);

            cleanLines.add(line);
        }

        Option lineTrimOption = citation.getOption(CitationAnchor.OptionKey.LINE_TRIM);
        List<String> strippedLines = (lineTrimOption == null || lineTrimOption.isTrue())
                ? collapseBlankLines(cleanLines)
                : cleanLines;

        return strippedLines.toArray(new String[strippedLines.size()]);
    }

    /**
     * Removes the fragment label comment from a line.
     *
     * @param line the line to examine.
     * @return the unchanged line if it does not match the fragment label pattern or if its
     *         label equals {@link Constants#CALLOUT_LABEL}; otherwise group 1 of the match with
     *         trailing spaces and tabs removed, or <code>null</code> if nothing remains, which
     *         tells the caller to drop the whole line.
     */
    protected String removeFragmentComment(String line) {
        Matcher m = getFragmentLabelPattern().matcher(line);
        if (!m.matches())
            return line;

        // Preserve CALLOUT labels; the label group is null if it did not take part in the match
        String label = m.group(2);
        if (label != null && label.equals(Constants.CALLOUT_LABEL))
            return line;

        // No captured text before the comment is treated like an empty remainder
        String cleanLine = m.group(1);
        if (cleanLine == null)
            return null;

        // Remove trailing whitespace then return remaining before-comment text
        int end = cleanLine.length();
        while (end > 0 && isSpaceOrTab(cleanLine.charAt(end - 1))) {
            end--;
        }

        // Well if nothing is left, we remove the whole line (returning null does that)
        return end == 0 ? null : cleanLine.substring(0, end);
    }

    /** Counts the whitespace characters at the start of the line with a single regex match. */
    private static int countLeadingWhitespace(String line) {
        Matcher m = LEADING_WHITESPACE.matcher(line);
        return m.matches() ? m.group(1).length() : 0;
    }

    /** Drops every blank line that is directly followed by another blank line. */
    private static List<String> collapseBlankLines(List<String> lines) {
        List<String> result = new ArrayList<String>(lines.size());
        for (int i = 0; i < lines.size(); i++) {
            String line = lines.get(i);
            boolean followedByBlank = i + 1 < lines.size() && isBlank(lines.get(i + 1));
            if (followedByBlank && isBlank(line))
                continue;
            result.add(line);
        }
        return result;
    }

    /** Tells whether the line is empty or whitespace only. */
    private static boolean isBlank(String line) {
        return BLANK_LINE.matcher(line).matches();
    }

    /** Tells whether the character is a space or a horizontal tab. */
    private static boolean isSpaceOrTab(char c) {
        return c == ' ' || c == '\t';
    }
}