/******************************************************************************
* Copyright (c) 2010 Basis Technology Corp.
*
* Basis Technology Corp. licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package com.basistech.readability;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.ListIterator;
import java.util.regex.Pattern;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.TextNode;
/**
* Maintains a map between PC-DATA offsets and the text nodes of an XML document. Provides some structure
* for the process of pulling the data out.
*/
public abstract class XmlDataMap {
    /** Matches text that is empty or made up only of regex whitespace; compiled once for reuse. */
    private static final Pattern ALL_WHITESPACE = Pattern.compile("[\\s]*");

    /**
     * Characters accepted as end-of-sentence punctuation. Quotes are included so that no extra
     * period is added after a sentence that ends with a quote. This is not fully Unicode-aware.
     */
    private static final String EOS_PUNCTUATION = "!?.\u2029\"\u0027\u2018\u2019\u201c\u201d";

    /** Character exposed through the getter/setter pair below; defaults to U+2029 (paragraph separator). */
    protected char forceSentenceChar = '\u2029';

    /**
     * Classify elements in the tree.
     */
    public enum ElementAction {
        /**
         * Ignore text under here. Text nodes that are direct children of the element are not
         * appended; child elements are still visited and classified on their own.
         */
        Banned,
        /**
         * Not currently used.
         */
        Alt,
        /**
         * Insert whitespace before and after the element's content.
         */
        Whitespace,
        /**
         * Treat as sentence boundary: whitespace before the content, sentence terminator after it.
         */
        Sentence,
        /**
         * Remember where this was: a {@link XmlDataMap.Mark} is recorded once the element's
         * content has been processed.
         */
        Mark;
    }

    /**
     * A remembered position in the PC-DATA stream, labelled with the tag name of the element
     * that produced it.
     */
    public static class Mark {
        private String tag;
        private int offset;

        /**
         * @return Returns the tag.
         */
        public String getTag() {
            return tag;
        }

        /**
         * @param tag The tag to set.
         */
        public void setTag(String tag) {
            this.tag = tag;
        }

        /**
         * @return Returns the offset.
         */
        public int getOffset() {
            return offset;
        }

        /**
         * @param offset The offset to set.
         */
        public void setOffset(int offset) {
            this.offset = offset;
        }
    }

    /** Accumulated PC-DATA text. */
    protected StringBuffer pcDataBuffer;
    private final List<Mark> marks;
    // CHECKSTYLE:OFF
    private final LinkedList<OffsetRange> offsetRanges;
    // CHECKSTYLE:ON
    /** Running count of the characters this class has appended to {@link #pcDataBuffer}. */
    private int pcDataOffset;
    private boolean justAppendedSpace;
    private boolean justAppendedPeriod;
    /** Lookup cursor over {@link #offsetRanges}; null until {@link #process(Element)} has run. */
    private ListIterator<OffsetRange> optimizedListPointer;
    /** Range most recently visited by the lookup cursor; null when there are no ranges. */
    private OffsetRange optimizedRangeListElement;

    protected XmlDataMap() {
        offsetRanges = new LinkedList<OffsetRange>();
        pcDataOffset = 0;
        pcDataBuffer = new StringBuffer();
        justAppendedSpace = false;
        justAppendedPeriod = false;
        marks = new ArrayList<Mark>();
    }

    /**
     * Find the range containing a PC-DATA offset. The search starts from the range returned by
     * the previous lookup, so a sequence of nearby lookups does not rescan the whole list.
     *
     * @param offset offset into the PC-DATA.
     * @return the range for which {@code offsetInRange(offset)} holds.
     * @throws IllegalStateException if {@link #process(Element)} has not been called.
     * @throws RuntimeException if no range containing the offset is found.
     */
    public OffsetRange findOffsetRangeForOffset(int offset) {
        if (optimizedListPointer == null) {
            throw new IllegalStateException("process() must be called before looking up offsets");
        }
        if (optimizedRangeListElement == null) {
            // process() produced no ranges at all, so nothing can contain the offset.
            throw new RuntimeException("Offset " + offset + " beyond last range");
        }
        if (optimizedRangeListElement.offsetInRange(offset)) {
            return optimizedRangeListElement;
        }
        if (offset >= optimizedRangeListElement.getStart()) {
            // Keep scanning while start == offset: appendPeriod() can record a zero-length range,
            // and the range that really holds the offset may come right after it.
            // NOTE(review): assumes offsetInRange() rejects zero-length ranges - confirm in OffsetRange.
            while (optimizedRangeListElement.getStart() <= offset && optimizedListPointer.hasNext()) {
                optimizedRangeListElement = optimizedListPointer.next();
                if (optimizedRangeListElement.offsetInRange(offset)) {
                    return optimizedRangeListElement;
                }
            }
            throw new RuntimeException("Offset " + offset + " beyond last range");
        }
        // we don't expect to exercise this case.
        while (offset < optimizedRangeListElement.getStart() && optimizedListPointer.hasPrevious()) {
            optimizedRangeListElement = optimizedListPointer.previous();
            if (optimizedRangeListElement.offsetInRange(offset)) {
                return optimizedRangeListElement;
            }
        }
        throw new RuntimeException("Offset " + offset + " before the first offset");
    }

    /**
     * Retrieve the offset ranges for the text nodes of the original tree.
     *
     * @return the ranges. This is the live internal list, not a copy.
     */
    public List<OffsetRange> getOffsetRanges() {
        return offsetRanges;
    }

    /**
     * @return the accumulated PC-DATA as a string.
     */
    public String getPcData() {
        return pcDataBuffer.toString();
    }

    /**
     * Retrieve the accumulated pc data.
     *
     * @return the live buffer, not a copy.
     */
    public StringBuffer getPcDataBuffer() {
        return pcDataBuffer;
    }

    /**
     * If we need to split a range for annotation, we want to keep the map of offset ranges usable. Note that
     * the caller has to revalidate or maintain any indices it has grabbed for ranges after the one we are
     * splitting. Note that this does not insert the new text node into the parent contents, the caller does
     * that.
     *
     * @param rangePoint index, in the list of offset ranges, of the range to split.
     * @param splitPoint character position within the range's text at which to split; must be
     *            greater than zero and less than the text length.
     * @return the new text node holding the text from {@code splitPoint} onward.
     */
    public TextNode splitText(int rangePoint, int splitPoint) {
        assert splitPoint > 0;
        OffsetRange range = offsetRanges.get(rangePoint);
        TextNode original = range.getText();
        String wholeText = original.text();
        assert splitPoint < wholeText.length();
        TextNode newText = new TextNode(wholeText.substring(splitPoint), null);
        original.text(wholeText.substring(0, splitPoint));
        OffsetRange newRange = new OffsetRange(range.getStart() + splitPoint, range.getEnd(), newText);
        offsetRanges.add(rangePoint + 1, newRange);
        range.setEnd(splitPoint + range.getStart());
        assert range.getText().text().length() == range.getEnd() - range.getStart();
        if (optimizedListPointer != null) {
            // The insertion invalidated the fail-fast list iterator; rebuild the lookup cursor
            // at the range we just split so later lookups do not hit ConcurrentModificationException.
            optimizedListPointer = offsetRanges.listIterator(rangePoint);
            optimizedRangeListElement = range;
        }
        return newText;
    }

    /**
     * A subclass may process metadata into the pc-data stream by calling this directly.
     * Empty text is ignored, and so is whitespace-only text that directly follows whitespace.
     *
     * @param textObject the text node recorded in the new offset range.
     * @param text the characters to append.
     */
    protected void append(TextNode textObject, String text) {
        if (text.length() == 0) {
            // Nothing to record; this also keeps charAt(length - 1) below in bounds.
            return;
        }
        // if an entire Text element is whitespace, chances are that it's <div>NL noise. We don't need it.
        if (justAppendedSpace && ALL_WHITESPACE.matcher(text).matches()) {
            return;
        }
        int start = pcDataOffset;
        pcDataBuffer.append(text);
        pcDataOffset += text.length();
        justAppendedSpace = Character.isWhitespace(text.charAt(text.length() - 1));
        justAppendedPeriod = eosPunctuation(lastNonWhitepaceCharacter(text));
        addRange(new OffsetRange(start, pcDataOffset, textObject));
    }

    /**
     * @param text the text to examine.
     * @return the last character of {@code text} that is not whitespace, or U+FEFF when there
     *         is none.
     */
    protected char lastNonWhitepaceCharacter(String text) {
        for (int index = text.length() - 1; index >= 0; index--) {
            char c = text.charAt(index);
            if (!Character.isWhitespace(c)) {
                return c;
            }
        }
        return '\ufeff'; // it won't count as punctuation
    }

    //SK: allow quotes to be considered as EOS punctuation, so that we don't
    // add extra punctuation to sentences ending with quotes. This isn't
    // entirely unicode-friendly, and we may want to fix that someday.
    private static boolean eosPunctuation(char c) {
        return EOS_PUNCTUATION.indexOf(c) != -1;
    }

    /**
     * End a sentence. Appends whatever is still missing of "space, period, space, line separator"
     * given what was appended last, and always records an offset range (possibly zero-length)
     * with no text node for it.
     */
    protected void appendPeriod() {
        int startPcDataOffset = pcDataOffset;
        String lineSeparator = System.getProperty("line.separator");
        String appendMe = null;
        if (!justAppendedPeriod) {
            // A period is needed; add a leading space only when the previous character is not one.
            appendMe = (justAppendedSpace ? ". " : " . ") + lineSeparator;
        } else if (!justAppendedSpace) {
            // The text already ends in sentence punctuation; only the separator is missing.
            appendMe = " " + lineSeparator;
        }
        if (appendMe != null) {
            pcDataBuffer.append(appendMe);
            pcDataOffset += appendMe.length();
            justAppendedPeriod = true;
            justAppendedSpace = true;
        }
        // we make a range so that the code can tell the difference between 'spurious, added, period'
        // and 'bug that failed to make an offset range.'
        addRange(new OffsetRange(startPcDataOffset, pcDataOffset, null));
    }

    /**
     * Append a single space unless the stream already ends in whitespace or sentence punctuation.
     * No offset range is recorded for the space.
     */
    protected void appendSpace() {
        if (!justAppendedSpace && !justAppendedPeriod) {
            justAppendedSpace = true;
            pcDataBuffer.append(' ');
            pcDataOffset++;
        }
    }

    /**
     * Decide how an element is to be treated while the tree is walked.
     *
     * @param element the element being visited.
     * @return the action for the element.
     */
    protected abstract ElementAction classifyElement(Element element);

    /**
     * Walk the tree under {@code rootElement}, accumulating PC-DATA, offset ranges and marks, then
     * position the lookup cursor used by {@link #findOffsetRangeForOffset(int)} at the first range.
     *
     * @param rootElement root of the tree to process.
     */
    public void process(Element rootElement) {
        recurse(rootElement);
        resetLookupCursor();
    }

    /** Record a range and, if a lookup cursor exists, rebuild it because the list changed under it. */
    private void addRange(OffsetRange range) {
        offsetRanges.add(range);
        if (optimizedListPointer != null) {
            resetLookupCursor();
        }
    }

    /** Point the lookup cursor at the start of the range list; tolerates an empty list. */
    private void resetLookupCursor() {
        optimizedListPointer = offsetRanges.listIterator();
        optimizedRangeListElement = offsetRanges.isEmpty() ? null : offsetRanges.getFirst();
    }

    /** Depth-first walk: apply the element's action around its text and element children. */
    private void recurse(Element element) {
        ElementAction action = classifyElement(element);
        if (action == ElementAction.Whitespace || action == ElementAction.Sentence) {
            appendSpace();
        }
        for (Node childNode : element.childNodes()) {
            // n.b., cdata not possible if we are coming from TagSoup. If we also handle
            // real xhtml by directly parsing it, then we have another story on our hands.
            // though we could use canonical XML to get rid of them.
            if (childNode instanceof TextNode && action != ElementAction.Banned) {
                TextNode textContent = (TextNode)childNode;
                append(textContent, textContent.text());
            } else if (childNode instanceof Element) {
                recurse((Element)childNode);
            }
        }
        if (action == ElementAction.Whitespace) {
            appendSpace();
        } else if (action == ElementAction.Sentence) {
            appendPeriod();
        } else if (action == ElementAction.Mark) {
            Mark mark = new Mark();
            mark.setOffset(pcDataOffset);
            mark.setTag(element.tagName());
            // Store the mark; without this getMarks() never reports anything.
            marks.add(mark);
        }
    }

    /**
     * @return Returns the marks.
     */
    public List<Mark> getMarks() {
        return marks;
    }

    public char getForceSentenceChar() {
        return forceSentenceChar;
    }

    public void setForceSentenceChar(char forceSentenceChar) {
        this.forceSentenceChar = forceSentenceChar;
    }
}