package org.wikipedia.page.linkpreview;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.wikipedia.page.Page;
import org.wikipedia.util.StringUtil;
import org.wikipedia.util.log.L;
import org.xml.sax.InputSource;
import java.io.StringReader;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
/**
 * Logic to obtain an extract text from a page. This extract text should be suitable for display
 * as plain text in the LinkPreview dialog.
 *
 * <p>The extract is computed lazily on the first call to {@link #getText()} and cached in a
 * plain (non-volatile, unsynchronized) field afterwards.
 */
public class PageExtract {
    /** Matches numeric reference markers such as {@code [1]} or {@code [23]}; compiled once. */
    private static final Pattern REFERENCE_MARKER = Pattern.compile("\\[\\d+\\]");

    private final Page page;
    private String text;

    /**
     * @param page Page whose first section is used as the source of the extract.
     */
    public PageExtract(Page page) {
        this.page = page;
    }

    /**
     * Returns the extract text, computing it on first use.
     *
     * @return The extract text; an empty string if the page has no sections or its first
     *         section could not be parsed.
     */
    public String getText() {
        if (text == null) {
            text = extractTextFromPage();
        }
        return text;
    }

    /**
     * Builds the extract from the {@code <p>} elements that are immediate children of the
     * first section's content.
     *
     * @return The concatenated paragraph text (each paragraph followed by a single space) with
     *         reference markers removed, or an empty string if nothing could be extracted.
     */
    private String extractTextFromPage() {
        // A page without sections has nothing to extract; avoid throwing from get(0).
        if (page.getSections() == null || page.getSections().isEmpty()) {
            return "";
        }
        NodeList elements = getXmlChildren(page.getSections().get(0).getContent());
        if (elements == null) {
            return "";
        }

        // First, collect the text of all the <p> tags from the first section.
        StringBuilder paragraphs = new StringBuilder();
        for (int i = 0; i < elements.getLength(); i++) {
            Node node = elements.item(i);
            if (node.getNodeName().equalsIgnoreCase("p")) {
                paragraphs.append(node.getTextContent()).append(' ');
            }
        }

        // Strip the unwanted markup (delegated to StringUtil.fromHtml).
        String firstSection = StringUtil.fromHtml(paragraphs.toString()).toString();

        // Strip the reference texts ([1], [2]...).
        return REFERENCE_MARKER.matcher(firstSection).replaceAll("");
    }

    /**
     * Parse the given HTML string and return a list of the immediate XML child nodes of that HTML.
     *
     * <p>The HTML is wrapped in a synthetic {@code <dummy>} root element so that a fragment
     * with several top-level elements still forms a single-rooted XML document.
     *
     * @param html HTML contents.
     * @return The list of XML child nodes, or null if {@code html} is null or there was an error.
     */
    private NodeList getXmlChildren(String html) {
        if (html == null) {
            // Without this check the literal text "null" would be parsed as the content.
            return null;
        }

        DocumentBuilder builder;
        try {
            builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        } catch (ParserConfigurationException e) {
            L.e(e);
            return null;
        }

        InputSource inputSource = new InputSource(new StringReader("<dummy>" + html + "</dummy>"));
        Document document;
        try {
            document = builder.parse(inputSource);
        } catch (Exception e) {
            // Deliberately broad: any failure to parse the content is logged and reported to
            // the caller as null rather than propagated.
            L.e(e);
            return null;
        }

        // The document element is the synthetic <dummy> wrapper; its children are the
        // top-level nodes of the original HTML.
        return document.getDocumentElement().getChildNodes();
    }
}