package com.formulasearchengine.mathosphere.mathpd.contracts;
import com.formulasearchengine.mathmltools.xmlhelper.NonWhitespaceNodeList;
import com.formulasearchengine.mathosphere.mathpd.Distances;
import com.formulasearchengine.mathosphere.mathpd.pojos.ArxivDocument;
import com.formulasearchengine.mathosphere.mathpd.pojos.ExtractedMathPDDocument;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.xpath.XPathExpressionException;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class TextExtractorMapper implements FlatMapFunction<String, Tuple2<String, ExtractedMathPDDocument>> {
private static final Logger LOGGER = LoggerFactory.getLogger(TextExtractorMapper.class);
private static final Pattern FILENAME_PATTERN_20PD = Pattern.compile("<ARXIVFILESPLIT(?:\\\\n?|[\\s\\r\\n]+)" +
"Filename=\"(.*?).xhtml\">(?:\\s*)(.*)", Pattern.DOTALL);
private static final Pattern FILENAME_PATTERN_NTCIR = Pattern.compile("<ARXIVFILESPLIT(?:\\\\n?|[\\s\\r\\n]+)" +
"Filename=\"\\./.+/(.*?)/\\1_(\\d+)_(\\d+)\\.xhtml\">(?:\\s*)(.*)", Pattern.DOTALL);
// private static final Pattern FILENAME_PATTERN_NTCIR = Pattern.compile("<ARXIVFILESPLIT(?:\\\\n?|[\\s\\r\\n]+)" +
// "Filename=\"\\./\\d+/(.*?)/\\1_(\\d+)_(\\d+)\\.xhtml\">(?:\\s*)(.*)", Pattern.DOTALL);
private final Pattern filenamePattern;
private final boolean isNtcir;
public TextExtractorMapper(boolean isNtcir) {
if (isNtcir) {
filenamePattern = FILENAME_PATTERN_NTCIR;
} else {
filenamePattern = FILENAME_PATTERN_20PD;
}
this.isNtcir = isNtcir;
}
public static ExtractedMathPDDocument convertArxivToExtractedMathPDDocument(ArxivDocument document) throws ParserConfigurationException, IOException, XPathExpressionException, TransformerException {
if (document == null) {
LOGGER.warn("ArxivDocument = null");
return null;
}
// try {
final ExtractedMathPDDocument extractedMathPDDocument = new ExtractedMathPDDocument(document.title, document.text);
extractedMathPDDocument.setName(document.getName());
extractedMathPDDocument.setPage(document.getPage());
// discard this document if no math tag is contained
NonWhitespaceNodeList mathTags = null;
try {
mathTags = document.getMathTags();
if (mathTags.getLength() == 0) {
LOGGER.info("{} contains no math tags", document.getName());
return null;
}
} catch (XPathExpressionException xPathExpressionException) {
LOGGER.error("following string could not be converted to xpath: {}", document.text);
return null;
}
// extract all features we are or might be interested in later
extractedMathPDDocument.setHistogramCn(Distances.getDocumentHistogram(document, "cn", mathTags));
extractedMathPDDocument.setHistogramCi(Distances.getDocumentHistogram(document, "ci", mathTags));
extractedMathPDDocument.setHistogramCsymbol(Distances.getDocumentHistogram(document, "ci", mathTags));
extractedMathPDDocument.setHistogramBvar(Distances.getDocumentHistogram(document, "bvar", mathTags));
return extractedMathPDDocument;
// } catch (Exception e) {
// LOGGER.error(e.getClass().toString());
// LOGGER.error(e.toString());
// return null;
//}
}
public static void main(String[] args) {
TextExtractorMapper t = new TextExtractorMapper(true);
t.getTitleAndTextualContent("<ARXIVFILESPLIT Filename=\"./xhtml5/1/0704.0097/0704.0097_1_11.xhtml\">\n" +
"<?xml version=\"1.0\" encoding=\"utf-8\"?>\n" +
"<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
"<head>");
}
public Tuple4<String, String, String, String> getTitleAndTextualContent(String content) {
if (content.startsWith("\n"))
content = content.substring(1);
Matcher titleMatcher = filenamePattern.matcher(content);
if (!titleMatcher.find()) {
LOGGER.error("found no title");
return null;
}
String title;
String xhtml;
String name = "no-name";
String page = "-1";
if (isNtcir) {
name = titleMatcher.group(1);
page = titleMatcher.group(3);
xhtml = titleMatcher.group(4);
} else {
title = titleMatcher.group(1);
xhtml = titleMatcher.group(2);
final String[] titleComponents = title.split("/");
// tailored to the input format, you might need to change this if you have another format
switch (titleComponents.length) {
case 5:
page = titleComponents[4];
case 4:
name = titleComponents[3];
break;
default:
throw new RuntimeException("title does not contain all components: " + title);
}
}
title = name + "/" + page;
//LOGGER.warn(name);
//LOGGER.warn(page);
return new Tuple4<>(title, name, page, xhtml);
}
public ArxivDocument arxivTextToDocument(String content) {
final Tuple4<String, String, String, String> titleAndContent = getTitleAndTextualContent(content);
if (titleAndContent == null) {
return null;
}
final ArxivDocument arxivDocument = new ArxivDocument(titleAndContent.f0, titleAndContent.f3);
arxivDocument.setName(titleAndContent.f1);
arxivDocument.setPage(titleAndContent.f2);
return arxivDocument;
}
@Override
public void flatMap(String content, Collector<Tuple2<String, ExtractedMathPDDocument>> out) throws ParserConfigurationException, TransformerException, XPathExpressionException, IOException {
final ArxivDocument document = arxivTextToDocument(content);
if (document == null) {
LOGGER.trace("could not convert raw string to ArxivDocuemt: {}", content.substring(0, content.length() > 100 ? 100 : content.length() - 1));
return;
}
LOGGER.info("processing document '{}'...", document.title);
final ExtractedMathPDDocument extractedMathPDDocument = convertArxivToExtractedMathPDDocument(document);
if (extractedMathPDDocument == null) {
LOGGER.info("could not convert ArxivDocument to ExtractedMathPDDocument: {}", document.title);
return;
}
// store the doc in the collector
LOGGER.info("finished processing document '{}'...", document.title);
out.collect(new Tuple2<>(extractedMathPDDocument.getName(), extractedMathPDDocument));
}
}