package com.formulasearchengine.mathosphere.mathpd;
import com.formulasearchengine.mathosphere.TestUtils;
import com.formulasearchengine.mathosphere.mathpd.contracts.TextExtractorMapper;
import com.formulasearchengine.mathosphere.mathpd.pojos.ArxivDocument;
import com.formulasearchengine.mathosphere.mathpd.pojos.ExtractedMathPDDocument;
import com.google.common.base.Throwables;
import org.apache.flink.api.java.tuple.Tuple4;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLDecoder;
import java.util.HashMap;
import static junit.framework.TestCase.assertTrue;
/**
 * Unit test cases for features and distances.
 *
 * <p>Created by Felix Hamborg {@literal <felixhamborg@gmail.com>} on 09.12.2016.
 */
public class FeaturesAndDistancesTest {
    /** Logger bound to this test class so its output is attributed to the right category. */
    private static final Logger LOGGER = LoggerFactory.getLogger(FeaturesAndDistancesTest.class);

    /** Classpath location of the small XHTML fixture shared by the document-based tests. */
    private static final String RESOURCE_SIMPLE = "com/formulasearchengine/mathosphere/mathpd/simple.xhtml";

    /**
     * Decodes a URL-encoded path using UTF-8.
     *
     * @param urlEncodedPath the URL-encoded path
     * @return the decoded path
     */
    private static String decodePath(String urlEncodedPath) {
        try {
            return URLDecoder.decode(urlEncodedPath, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw Throwables.propagate(e);
        }
    }

    /**
     * Reads a test resource and converts it into an {@link ExtractedMathPDDocument}.
     *
     * @param path resource path handed to {@code TestUtils.getFileContents}
     * @return the document produced by {@code TextExtractorMapper}
     */
    private static ExtractedMathPDDocument testResourceToExtractedMathPDDocument(String path) throws IOException, TransformerException, XPathExpressionException, ParserConfigurationException {
        final String fileContents = TestUtils.getFileContents(path);
        final ArxivDocument arxivDocument = new TextExtractorMapper(false).arxivTextToDocument(fileContents);
        return TextExtractorMapper.convertArxivToExtractedMathPDDocument(arxivDocument);
    }

    /**
     * Builds the operator histogram used as the fixture of {@link #testCosineSimilarity()}.
     *
     * @return a fresh, mutable histogram
     */
    private static HashMap<String, Double> symbolHistogram() {
        final HashMap<String, Double> histogram = new HashMap<>();
        histogram.put("minus", 1.0);
        histogram.put("plus", 3.0);
        histogram.put("times", 1.0);
        histogram.put("divide", 1.0);
        histogram.put("eq", 2.0);
        histogram.put("list", 1.0);
        return histogram;
    }

    /**
     * Asserts that every component of a distance tuple is exactly zero.
     * Checking each component on its own prevents values of opposite sign from cancelling out
     * and makes the failure message name the offending component.
     *
     * @param label     name of the distance, used in failure messages
     * @param distances the four-component distance tuple to check
     */
    private static void assertAllZero(String label, Tuple4<Double, Double, Double, Double> distances) {
        assertTrue(label + ".f0 should be 0.0 but was " + distances.f0, distances.f0 == 0.0);
        assertTrue(label + ".f1 should be 0.0 but was " + distances.f1, distances.f1 == 0.0);
        assertTrue(label + ".f2 should be 0.0 but was " + distances.f2, distances.f2 == 0.0);
        assertTrue(label + ".f3 should be 0.0 but was " + distances.f3, distances.f3 == 0.0);
    }

    /**
     * Resolves a classpath resource name to a decoded file-system path.
     *
     * @param resourceName the resource name, relative to the classpath root
     * @return the URL-decoded file part of the resource URL
     * @throws IllegalArgumentException if the resource cannot be found on the classpath
     */
    private String resourcePath(String resourceName) {
        final URL resource = getClass().getClassLoader().getResource(resourceName);
        if (resource == null) {
            // getResource signals a missing resource with null; fail with a message instead of an NPE.
            throw new IllegalArgumentException("Test resource not found: " + resourceName);
        }
        return decodePath(resource.getFile());
    }

    /**
     * Two documents extracted from the same file must have zero absolute, relative and
     * earth mover distance.
     */
    @Test
    public void testDistanceSameFile() throws Exception {
        final ExtractedMathPDDocument doc1 = testResourceToExtractedMathPDDocument(RESOURCE_SIMPLE);
        final ExtractedMathPDDocument doc2 = testResourceToExtractedMathPDDocument(RESOURCE_SIMPLE);
        assertTrue("first document should not be null", doc1 != null);
        assertTrue("second document should not be null", doc2 != null);

        final Tuple4<Double, Double, Double, Double> distanceAbsoluteAllFeatures = Distances.distanceAbsoluteAllFeatures(doc1, doc2);
        LOGGER.debug("absolute distance = {}", distanceAbsoluteAllFeatures);

        final Tuple4<Double, Double, Double, Double> distanceRelativeAllFeatures = Distances.distanceRelativeAllFeatures(doc1, doc2);
        LOGGER.debug("relative distance = {}", distanceRelativeAllFeatures);

        final double distanceEarthMoverAllFeatures = Distances.computeEarthMoverAbsoluteDistance(doc1.getHistogramCi(), doc2.getHistogramCi());
        LOGGER.debug("earth mover distance = {}", distanceEarthMoverAllFeatures);

        assertAllZero("absolute distance", distanceAbsoluteAllFeatures);
        assertAllZero("relative distance", distanceRelativeAllFeatures);
        assertTrue("earth mover distance should be 0.0 but was " + distanceEarthMoverAllFeatures,
                distanceEarthMoverAllFeatures == 0.0);
    }

    /**
     * The earth mover distance between the identifier histograms of two documents extracted
     * from the same file must be zero.
     */
    @Test
    public void testEarthMoverDistance() throws ParserConfigurationException, TransformerException, XPathExpressionException, IOException {
        final ExtractedMathPDDocument doc1 = testResourceToExtractedMathPDDocument(RESOURCE_SIMPLE);
        final ExtractedMathPDDocument doc2 = testResourceToExtractedMathPDDocument(RESOURCE_SIMPLE);

        final double distanceAbsoluteEarthMoverCiSame = Distances.computeEarthMoverAbsoluteDistance(doc1.getHistogramCi(), doc2.getHistogramCi());
        LOGGER.debug("earthmover absolute distance = {}", distanceAbsoluteEarthMoverCiSame);

        assertTrue("earth mover distance should be 0.0 but was " + distanceAbsoluteEarthMoverCiSame,
                distanceAbsoluteEarthMoverCiSame == 0.0);
    }

    /**
     * Intended to compare the histograms extracted from the fixture against hand-written
     * expectations. The whole body is currently commented out, so this test passes vacuously.
     */
    @Test
    public void testHistogramExtractionAndAbsoluteDistance() throws ParserConfigurationException, TransformerException, XPathExpressionException, IOException {
        // NOTE(review): body disabled; the reason is not visible in this file. Re-enable or remove.
        /*
        final String resourceSimple = "com/formulasearchengine/mathosphere/mathpd/simple.xhtml";
        ExtractedMathPDDocument document = testResourceToExtractedMathPDDocument(resourceSimple);
        // bound variables
        assertTrue(document.getHistogramBvar().size() == 0.0);
        // identifiers
        HashMap<String, Double> histogramCi = new HashMap<>();
        histogramCi.put("\uD835\uDC4E", 1.0);
        histogramCi.put("\uD835\uDC4F", 1.0);
        histogramCi.put("\uD835\uDC50", 1.0);
        histogramCi.put("\uD835\uDC51", 1.0);
        assertTrue(Distances.computeAbsoluteDistance(document.getHistogramCi(), histogramCi) == 0.0);
        // numbers
        HashMap<String, Double> histogramCn = new HashMap<>();
        histogramCn.put("1", 1.0);
        histogramCn.put("2", 1.0);
        histogramCn.put("3", 1.0);
        histogramCn.put("4", 1.0);
        assertTrue(Distances.computeAbsoluteDistance(document.getHistogramCn(), histogramCn) == 0.0);
        // symbols
        HashMap<String, Double> histogramCsymbol = new HashMap<>();
        histogramCsymbol.put("minus", 1.0);
        histogramCsymbol.put("plus", 3.0);
        histogramCsymbol.put("times", 1.0);
        histogramCsymbol.put("divide", 1.0);
        histogramCsymbol.put("eq", 2.0);
        histogramCsymbol.put("list", 1.0);
        assertTrue(Distances.computeAbsoluteDistance(document.getHistogramCsymbol(), histogramCsymbol) == 0.0);
        */
    }

    /**
     * Smoke test: computes the cosine distance between a histogram and the result of
     * {@code Distances.histogramsPlus} applied to an equal histogram with itself, and logs it.
     * No expected value is asserted.
     */
    @Test
    public void testCosineSimilarity() {
        final HashMap<String, Double> h1 = symbolHistogram();
        final HashMap<String, Double> base = symbolHistogram();
        // presumably histogramsPlus adds the counts key by key, yielding a scaled copy of h1 - TODO confirm
        final HashMap<String, Double> h2 = Distances.histogramsPlus(base, base);
        LOGGER.debug("cosine distance = {}", Distances.computeCosineDistance(h1, h2));
    }

    /**
     * Smoke test: computes the earth mover distance between two small hand-built histograms
     * and logs it. No expected value is asserted.
     */
    @Test
    public void testEarthMoverDistanceBasic() {
        final HashMap<String, Double> histogramCi1 = new HashMap<>();
        histogramCi1.put("a", 3.0);
        histogramCi1.put("b", 1.0);
        histogramCi1.put("c", 1.0);

        final HashMap<String, Double> histogramCi2 = new HashMap<>();
        histogramCi2.put("a", 1.0);
        histogramCi2.put("b", 1.0);
        histogramCi2.put("c", 3.0);

        final double distance = Distances.computeEarthMoverAbsoluteDistance(histogramCi1, histogramCi2);
        LOGGER.debug("earth mover distance = {}", distance);
    }
}