package com.formulasearchengine.mathosphere.mathpd.pojos;
import com.formulasearchengine.mathmltools.xmlhelper.NonWhitespaceNodeList;
import com.formulasearchengine.mathmltools.xmlhelper.XMLHelper;
import com.google.common.collect.HashMultiset;
import com.google.common.collect.Multiset;
import org.apache.commons.lang3.StringUtils;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import javax.xml.xpath.XPathExpressionException;
import java.util.ArrayList;
import java.util.List;
/**
 * Plain data holder for a single document, consisting of a title and its raw text,
 * plus helper methods that parse the text as XML and extract the math markup it contains.
 *
 * <p>The text is parsed anew on every call to {@link #getDoc()} (and therefore on every
 * call to the methods built on top of it); nothing is cached in this class.
 */
public class ArxivDocument {

    /** Title of the document. */
    public String title;

    /** Raw document content; handed to the XML parser by {@link #getDoc()}. */
    public String text;

    // The two fields below are optional; neither constructor sets them.

    /** Optional name, only populated through {@link #setName(String)} or direct field access. */
    public String name;

    /** Optional page, only populated through {@link #setPage(String)} or direct field access. */
    public String page;

    /** Creates an empty document with all fields left {@code null}. */
    public ArxivDocument() {
    }

    /**
     * Creates a document with the given title and text.
     *
     * @param title the document title
     * @param text  the raw document content
     */
    public ArxivDocument(String title, String text) {
        this.text = text;
        this.title = title;
    }

    /**
     * @return the optional name, or {@code null} if it was never set
     */
    public String getName() {
        return this.name;
    }

    /**
     * @param name the optional name to store
     */
    public void setName(String name) {
        this.name = name;
    }

    /**
     * @return the optional page, or {@code null} if it was never set
     */
    public String getPage() {
        return this.page;
    }

    /**
     * @param page the optional page to store
     */
    public void setPage(String page) {
        this.page = page;
    }

    /**
     * Parses {@link #text} into a DOM document via {@code XMLHelper.String2Doc}.
     *
     * @return the result of {@code XMLHelper.String2Doc(text, true)}
     */
    public Document getDoc() {
        // NOTE(review): the meaning of the boolean flag is defined by XMLHelper
        // (presumably namespace awareness) - confirm against its documentation.
        final Document parsed = XMLHelper.String2Doc(text, true);
        return parsed;
    }

    /**
     * Collects the nodes matched by the XPath expression {@code //*:math} in the parsed document.
     *
     * @return the matched nodes wrapped in a {@link NonWhitespaceNodeList}
     * @throws XPathExpressionException declared for the underlying XPath lookup
     */
    public NonWhitespaceNodeList getMathTags() throws XPathExpressionException {
        final Document parsed = getDoc();
        return new NonWhitespaceNodeList(XMLHelper.getElementsB(parsed, "//*:math"));
    }

    /**
     * Gathers, across all math tags of this document, the identifiers reported by
     * {@code XMLHelper.getIdentifiersFromCmml} into a single multiset.
     *
     * @return a new multiset holding the identifiers of every math tag
     * @throws XPathExpressionException declared for the underlying XPath lookup
     */
    public Multiset<String> getCElements() throws XPathExpressionException {
        final Multiset<String> collected = HashMultiset.create();
        for (Node mathTag : getMathTags()) {
            collected.addAll(XMLHelper.getIdentifiersFromCmml(mathTag));
        }
        return collected;
    }

    /**
     * Returns an ordered list of all Content-Element leaf nodes of this document.
     *
     * <p>Math tags are visited in the order delivered by {@link #getMathTags()}, and the
     * leaf nodes of each tag are appended in the order of the node list returned by
     * {@code XMLHelper.getLeafNodesFromCmml}.
     *
     * @return a new list with the leaf nodes of every math tag
     * @throws XPathExpressionException declared for the underlying XPath lookup
     */
    public List<Node> getCElementLeafNodes() throws XPathExpressionException {
        final List<Node> result = new ArrayList<>();
        for (Node mathTag : getMathTags()) {
            appendAll(XMLHelper.getLeafNodesFromCmml(mathTag), result);
        }
        return result;
    }

    /** Appends every item of {@code source}, in index order, to {@code target}. */
    private static void appendAll(NodeList source, List<Node> target) {
        int index = 0;
        while (index < source.getLength()) {
            target.add(source.item(index));
            index++;
        }
    }

    /**
     * @return a short description containing the title and the text abbreviated to 100 characters
     */
    @Override
    public String toString() {
        final String shortenedText = StringUtils.abbreviate(text, 100);
        return "[title=" + title + ", text=" + shortenedText + "]";
    }
}