package com.formulasearchengine.mathosphere.mathpd;

import com.formulasearchengine.mathosphere.mathpd.pojos.ArxivDocument;
import org.apache.flink.api.java.tuple.Tuple2;
import org.w3c.dom.Node;

import javax.xml.xpath.XPathExpressionException;
import java.util.ArrayList;
import java.util.List;

/**
 * Extraction of basic features required for Math-PD use cases.
 */
public class MathPdFeatureExtractor {

    /**
     * Returns the trimmed text content of a DOM node.
     *
     * @param n the node to read; may be {@code null}
     * @return the result of {@link Node#getTextContent()} with leading and trailing
     *         whitespace removed, or an empty string if {@code n} is {@code null} or
     *         has no text content
     */
    private static String getNodeTextContent(Node n) {
        // A leaf without a parent (detached node) reaches this method as null.
        if (n == null) {
            return "";
        }
        // Node#getTextContent() is specified to return null for Document, DocumentType
        // and Notation nodes; a Document is what getParentNode() yields for the root element.
        final String text = n.getTextContent();
        return text == null ? "" : text.trim();
    }

    /**
     * Builds one pair for every node returned by {@code document.getCElementLeafNodes()}.
     * The first component is the trimmed text content of that node, the second component
     * is the trimmed text content of its direct parent node. A missing parent, or a parent
     * without text content, is represented by an empty string.
     * <p>
     * NOTE(review): the previous documentation described each node as "represented by its
     * tag name" and gave the example
     * {@code e=mc^2 --> bigrams: e =; = m; m \times; c ^; ^ 2}.
     * The implementation uses {@link Node#getTextContent()}, not the tag name, so that
     * description and example should be confirmed against the intended behavior.
     *
     * @param document the document whose content-element leaf nodes are paired with their parents
     * @return a new mutable list of (leaf text, parent text) pairs, in the iteration order of
     *         {@code document.getCElementLeafNodes()}
     * @throws XPathExpressionException presumably propagated from
     *         {@code document.getCElementLeafNodes()} - confirm against {@code ArxivDocument}
     */
    public static List<Tuple2<String, String>> getBigramLeaves(ArxivDocument document) throws XPathExpressionException {
        final List<Tuple2<String, String>> bigramLeaves = new ArrayList<>();
        for (Node curLeafNode : document.getCElementLeafNodes()) {
            final String leafText = getNodeTextContent(curLeafNode);
            final String parentText = getNodeTextContent(curLeafNode.getParentNode());
            bigramLeaves.add(new Tuple2<>(leafText, parentText));
        }
        return bigramLeaves;
    }
}