package com.formulasearchengine.mathosphere.mlp.contracts;
import com.formulasearchengine.mathosphere.mlp.pojos.RawWikiDocument;
import org.apache.commons.lang3.text.translate.AggregateTranslator;
import org.apache.commons.lang3.text.translate.CharSequenceTranslator;
import org.apache.commons.lang3.text.translate.EntityArrays;
import org.apache.commons.lang3.text.translate.LookupTranslator;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.util.Collector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Flat-map function that extracts the title, namespace and text body from one input string and
 * emits them as a {@link RawWikiDocument}.
 *
 * <p>The input is expected to contain {@code <title>}, {@code <ns>} and {@code <text>} elements
 * (presumably one page of a MediaWiki XML dump - confirm against the caller). Records that lack
 * any of these elements, whose namespace is not a valid integer, or whose namespace is not
 * {@code 0} are skipped without emitting anything.
 */
public class TextExtractorMapper implements FlatMapFunction<String, RawWikiDocument> {

  private static final Logger LOGGER = LoggerFactory.getLogger(TextExtractorMapper.class);

  /** Captures the content of the first {@code <title>} element. */
  private static final Pattern TITLE_PATTERN = Pattern.compile("(?:<title>)(.*?)(?:</title>)");

  /** Captures the content of the first {@code <ns>} element. */
  private static final Pattern NAMESPACE_PATTERN = Pattern.compile("(?:<ns>)(.*?)(?:</ns>)");

  /**
   * Captures the content of the first {@code <text ...>} element; {@code DOTALL} lets the body
   * span multiple lines.
   */
  private static final Pattern TEXT_PATTERN = Pattern.compile("(?:<text.*?>)(.*?)(?:</text>)",
      Pattern.DOTALL);

  /**
   * Entity unescaper used by {@link #unescape(String)}. The lookup tables are consulted in the
   * order listed here; the first one that consumes input at a position wins.
   */
  private static final CharSequenceTranslator TRANSLATOR = new AggregateTranslator(
      new LookupTranslator(EntityArrays.ISO8859_1_UNESCAPE()),
      new LookupTranslator(EntityArrays.BASIC_UNESCAPE()),
      new LookupTranslator(EntityArrays.HTML40_EXTENDED_UNESCAPE()));

  /**
   * Parses one input record and emits at most one document.
   *
   * @param content the raw record; {@code null} or structurally incomplete records are skipped
   * @param out collector that receives the extracted document, if any
   * @throws Exception declared by the {@link FlatMapFunction} contract
   */
  @Override
  public void flatMap(String content, Collector<RawWikiDocument> out) throws Exception {
    if (content == null) {
      // nothing to parse; treat like any other malformed record
      return;
    }

    Matcher titleMatcher = TITLE_PATTERN.matcher(content);
    if (!titleMatcher.find()) {
      return;
    }

    // NOTE(review): the title is passed on exactly as matched, i.e. entities in it are NOT
    // unescaped (only the text body is). Confirm whether downstream consumers rely on that.
    String title = titleMatcher.group(1);
    LOGGER.info("processing document '{}'...", title);

    Matcher namespaceMatcher = NAMESPACE_PATTERN.matcher(content);
    if (!namespaceMatcher.find()) {
      return;
    }

    String rawNamespace = namespaceMatcher.group(1);
    int ns;
    try {
      ns = Integer.parseInt(rawNamespace.trim());
    } catch (NumberFormatException e) {
      // A single malformed record must not abort the whole job; skip it like the other
      // malformed-input cases in this method.
      LOGGER.warn("skipping document '{}': namespace '{}' is not a valid integer",
          title, rawNamespace);
      return;
    }

    if (ns != 0) {
      // skip docs from namespaces other than 0
      return;
    }

    // parse text
    Matcher textMatcher = TEXT_PATTERN.matcher(content);
    if (!textMatcher.find()) {
      return;
    }

    String rawText = textMatcher.group(1);
    String text = unescape(rawText);

    out.collect(new RawWikiDocument(title, ns, text));
  }

  /**
   * Unescapes entity sequences such as {@code &lt;} into the characters they stand for (here
   * {@code <}). All ISO-8859-1, HTML 4 and basic XML entities are translated.
   *
   * @param text the text that will be unescaped
   * @return the unescaped version of {@code text}
   */
  public static String unescape(String text) {
    return TRANSLATOR.translate(text);
  }
}