package io.lumify.wikipedia.mapreduce; import com.altamiracorp.bigtable.model.accumulo.AccumuloSession; import com.google.inject.Inject; import io.lumify.core.config.Configuration; import io.lumify.core.config.HashMapConfigurationLoader; import io.lumify.core.mapreduce.LumifyElementMapperBase; import io.lumify.core.model.audit.Audit; import io.lumify.core.model.audit.AuditAction; import io.lumify.core.model.properties.LumifyProperties; import io.lumify.core.model.termMention.TermMentionBuilder; import io.lumify.core.model.user.UserRepository; import io.lumify.core.security.DirectVisibilityTranslator; import io.lumify.core.security.VisibilityTranslator; import io.lumify.core.user.SystemUser; import io.lumify.core.user.User; import io.lumify.core.util.LumifyLogger; import io.lumify.core.util.LumifyLoggerFactory; import io.lumify.core.version.VersionService; import io.lumify.securegraph.model.audit.SecureGraphAuditRepository; import io.lumify.web.clientapi.model.VisibilityJson; import io.lumify.wikipedia.*; import org.apache.commons.codec.binary.Base64; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Counter; import org.jdom2.Document; import org.jdom2.JDOMException; import org.jdom2.filter.Filters; import org.jdom2.input.SAXBuilder; import org.jdom2.xpath.XPathExpression; import org.jdom2.xpath.XPathFactory; import org.securegraph.*; import org.securegraph.accumulo.AccumuloAuthorizations; import org.securegraph.accumulo.mapreduce.SecureGraphMRUtils; import org.securegraph.property.StreamingPropertyValue; import org.securegraph.util.ConvertingIterable; import org.securegraph.util.JoinIterable; import org.sweble.wikitext.engine.EngineException; import org.sweble.wikitext.engine.PageId; import org.sweble.wikitext.engine.PageTitle; import org.sweble.wikitext.engine.WtEngineImpl; import org.sweble.wikitext.engine.config.WikiConfigImpl; import org.sweble.wikitext.engine.nodes.EngProcessedPage; import org.sweble.wikitext.engine.utils.DefaultConfigEnWp; import org.sweble.wikitext.parser.parser.LinkTargetException; import java.io.ByteArrayInputStream; import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.List; import java.util.Map; class ImportMRMapper extends LumifyElementMapperBase<LongWritable, Text> { private static final LumifyLogger LOGGER = LumifyLoggerFactory.getLogger(ImportMRMapper.class); public static final String TEXT_XPATH = "/page/revision/text/text()"; public static final String TITLE_XPATH = "/page/title/text()"; public static final String REVISION_TIMESTAMP_XPATH = "/page/revision/timestamp/text()"; public static final SimpleDateFormat ISO8601DATEFORMAT = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss"); public static final String CONFIG_SOURCE_FILE_NAME = "sourceFileName"; private static final String WIKIPEDIA_PROCESS = ImportMR.class.getName(); private XPathExpression<org.jdom2.Text> textXPath; private XPathExpression<org.jdom2.Text> titleXPath; private XPathExpression<org.jdom2.Text> revisionTimestampXPath; private Visibility visibility; private Authorizations authorizations; private WikiConfigImpl config; private WtEngineImpl compiler; private User user; private SecureGraphAuditRepository auditRepository; private UserRepository userRepository; private String sourceFileName; private Counter pagesProcessedCounter; private Text auditTableNameText; private Counter pagesSkippedCounter; private VisibilityJson visibilityJson; private VisibilityTranslator visibilityTranslator; private Visibility defaultVisibility; public ImportMRMapper() { this.textXPath = XPathFactory.instance().compile(TEXT_XPATH, Filters.text()); this.titleXPath = XPathFactory.instance().compile(TITLE_XPATH, Filters.text()); this.revisionTimestampXPath = XPathFactory.instance().compile(REVISION_TIMESTAMP_XPATH, Filters.text()); } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Map configurationMap = SecureGraphMRUtils.toMap(context.getConfiguration()); this.visibilityTranslator = new DirectVisibilityTranslator(); this.visibility = this.visibilityTranslator.getDefaultVisibility(); this.defaultVisibility = this.visibilityTranslator.getDefaultVisibility(); this.visibilityJson = new VisibilityJson(); this.authorizations = new AccumuloAuthorizations(); this.user = new SystemUser(null); VersionService versionService = new VersionService(); Configuration configuration = new HashMapConfigurationLoader(configurationMap).createConfiguration(); this.auditRepository = new SecureGraphAuditRepository(null, versionService, configuration, null, userRepository); this.sourceFileName = context.getConfiguration().get(CONFIG_SOURCE_FILE_NAME); try { config = DefaultConfigEnWp.generate(); compiler = new WtEngineImpl(config); } catch (Exception ex) { throw new IOException("Could not configure sweble", ex); } pagesProcessedCounter = context.getCounter(WikipediaImportCounters.PAGES_PROCESSED); pagesSkippedCounter = context.getCounter(WikipediaImportCounters.PAGES_SKIPPED); auditTableNameText = new Text(Audit.TABLE_NAME); } @Override protected void safeMap(LongWritable filePosition, Text line, Context context) throws IOException, InterruptedException { ParsePage parsePage; TextConverter textConverter = new TextConverter(config); String pageString = line.toString().replaceAll("\\\\n", "\n"); try { parsePage = new ParsePage(pageString).invoke(); } catch (JDOMException e) { LOGGER.error("Could not parse XML: " + filePosition + ":\n" + pageString, e); context.getCounter(WikipediaImportCounters.XML_PARSE_ERRORS).increment(1); return; } context.progress(); if (shouldSkip(parsePage)) { pagesSkippedCounter.increment(1); return; } String wikipediaPageVertexId = WikipediaConstants.getWikipediaPageVertexId(parsePage.getPageTitle()); context.setStatus(wikipediaPageVertexId); try { String wikitext = getPageText(parsePage.getWikitext(), wikipediaPageVertexId, textConverter); parsePage.setWikitext(wikitext); } catch (Exception ex) { LOGGER.error("Could not process wikipedia text: " + filePosition + ":\n" + parsePage.getWikitext(), ex); context.getCounter(WikipediaImportCounters.WIKI_TEXT_PARSE_ERRORS).increment(1); return; } context.progress(); String multiKey = ImportMR.MULTI_VALUE_KEY + '#' + parsePage.getPageTitle(); Vertex pageVertex = savePage(context, wikipediaPageVertexId, parsePage, pageString, multiKey); context.progress(); savePageLinks(context, pageVertex, textConverter, multiKey); pagesProcessedCounter.increment(1); } private boolean shouldSkip(ParsePage parsePage) { String lowerCaseTitle = parsePage.getPageTitle().toLowerCase(); if (lowerCaseTitle.startsWith("wikipedia:")) { return true; } return false; } private Vertex savePage(Context context, String wikipediaPageVertexId, ParsePage parsePage, String pageString, String multiKey) throws IOException, InterruptedException { boolean isRedirect = parsePage.getWikitext().startsWith("REDIRECT:"); StreamingPropertyValue rawPropertyValue = new StreamingPropertyValue(new ByteArrayInputStream(pageString.getBytes()), byte[].class); rawPropertyValue.store(true); rawPropertyValue.searchIndex(false); StreamingPropertyValue textPropertyValue = new StreamingPropertyValue(new ByteArrayInputStream(parsePage.getWikitext().getBytes()), String.class); VertexBuilder pageVertexBuilder = prepareVertex(wikipediaPageVertexId, visibility); LumifyProperties.CONCEPT_TYPE.setProperty(pageVertexBuilder, WikipediaConstants.WIKIPEDIA_PAGE_CONCEPT_URI, visibility); LumifyProperties.MIME_TYPE.setProperty(pageVertexBuilder, ImportMR.WIKIPEDIA_MIME_TYPE, visibility); LumifyProperties.FILE_NAME.setProperty(pageVertexBuilder, sourceFileName, visibility); LumifyProperties.SOURCE.setProperty(pageVertexBuilder, WikipediaConstants.WIKIPEDIA_SOURCE, visibility); Metadata rawMetadata = new Metadata(); LumifyProperties.CONFIDENCE.setMetadata(rawMetadata, isRedirect ? 0.3 : 0.4, defaultVisibility); LumifyProperties.RAW.addPropertyValue(pageVertexBuilder, multiKey, rawPropertyValue, rawMetadata, visibility); Metadata titleMetadata = new Metadata(); LumifyProperties.CONFIDENCE.setMetadata(titleMetadata, isRedirect ? 0.3 : 0.4, defaultVisibility); LumifyProperties.TITLE.addPropertyValue(pageVertexBuilder, multiKey, parsePage.getPageTitle(), titleMetadata, visibility); Metadata sourceUrlMetadata = new Metadata(); LumifyProperties.CONFIDENCE.setMetadata(sourceUrlMetadata, isRedirect ? 0.3 : 0.4, defaultVisibility); LumifyProperties.SOURCE_URL.addPropertyValue(pageVertexBuilder, multiKey, parsePage.getSourceUrl(), sourceUrlMetadata, visibility); if (parsePage.getRevisionTimestamp() != null) { Metadata publishedDateMetadata = new Metadata(); LumifyProperties.CONFIDENCE.setMetadata(publishedDateMetadata, isRedirect ? 0.3 : 0.4, defaultVisibility); LumifyProperties.PUBLISHED_DATE.addPropertyValue(pageVertexBuilder, multiKey, parsePage.getRevisionTimestamp(), publishedDateMetadata, visibility); } if (!isRedirect) { Metadata textMetadata = new Metadata(); LumifyProperties.META_DATA_TEXT_DESCRIPTION.setMetadata(textMetadata, "Text", defaultVisibility); LumifyProperties.TEXT.addPropertyValue(pageVertexBuilder, multiKey, textPropertyValue, textMetadata, visibility); } Vertex pageVertex = pageVertexBuilder.save(authorizations); // audit vertex Audit audit = auditRepository.createAudit(AuditAction.CREATE, pageVertex.getId(), "Wikipedia MR", "", user, visibility); context.write(auditTableNameText, AccumuloSession.createMutationFromRow(audit)); // because save above will cause the StreamingPropertyValue to be read we need to reset the position to 0 for search indexing rawPropertyValue.getInputStream().reset(); textPropertyValue.getInputStream().reset(); return pageVertex; } private String getPageText(String wikiText, String wikipediaPageVertexId, TextConverter textConverter) throws LinkTargetException, EngineException { String fileTitle = wikipediaPageVertexId; PageId pageId = new PageId(PageTitle.make(config, fileTitle), -1); EngProcessedPage compiledPage = compiler.postprocess(pageId, wikiText, null); String text = (String) textConverter.go(compiledPage.getPage()); if (text.length() > 0) { wikiText = text; } return wikiText; } private void savePageLinks(Context context, Vertex pageVertex, TextConverter textConverter, String pageTextKey) throws IOException, InterruptedException { for (LinkWithOffsets link : getLinks(textConverter)) { savePageLink(context, pageVertex, link, pageTextKey); context.progress(); } } private void savePageLink(Context context, Vertex pageVertex, LinkWithOffsets link, String pageTextKey) throws IOException, InterruptedException { String linkTarget = link.getLinkTargetWithoutHash(); String linkVertexId = WikipediaConstants.getWikipediaPageVertexId(linkTarget); context.setStatus(pageVertex.getId() + " [" + linkVertexId + "]"); VertexBuilder linkedPageVertexBuilder = prepareVertex(linkVertexId, visibility); LumifyProperties.CONCEPT_TYPE.setProperty(linkedPageVertexBuilder, WikipediaConstants.WIKIPEDIA_PAGE_CONCEPT_URI, visibility); LumifyProperties.MIME_TYPE.setProperty(linkedPageVertexBuilder, ImportMR.WIKIPEDIA_MIME_TYPE, visibility); LumifyProperties.SOURCE.setProperty(linkedPageVertexBuilder, WikipediaConstants.WIKIPEDIA_SOURCE, visibility); LumifyProperties.FILE_NAME.setProperty(linkedPageVertexBuilder, sourceFileName, visibility); Metadata titleMetadata = new Metadata(); LumifyProperties.CONFIDENCE.setMetadata(titleMetadata, 0.1, defaultVisibility); String linkTargetHash = Base64.encodeBase64String(linkTarget.trim().toLowerCase().getBytes()); LumifyProperties.TITLE.addPropertyValue(linkedPageVertexBuilder, ImportMR.MULTI_VALUE_KEY + "#" + linkTargetHash, linkTarget, titleMetadata, visibility); Vertex linkedPageVertex = linkedPageVertexBuilder.save(authorizations); Edge edge = addEdge(WikipediaConstants.getWikipediaPageToPageEdgeId(pageVertex, linkedPageVertex), pageVertex, linkedPageVertex, WikipediaConstants.WIKIPEDIA_PAGE_INTERNAL_LINK_WIKIPEDIA_PAGE_CONCEPT_URI, visibility, authorizations); new TermMentionBuilder() .sourceVertex(pageVertex) .propertyKey(pageTextKey) .start(link.getStartOffset()) .end(link.getEndOffset()) .title(linkTarget) .conceptIri(WikipediaConstants.WIKIPEDIA_PAGE_CONCEPT_URI) .visibilityJson(visibilityJson) .process(WIKIPEDIA_PROCESS) .resolvedTo(linkedPageVertex, edge) .save(getGraph(), visibilityTranslator, authorizations); } private Iterable<LinkWithOffsets> getLinks(TextConverter textConverter) { return new JoinIterable<>( new ConvertingIterable<InternalLinkWithOffsets, LinkWithOffsets>(textConverter.getInternalLinks()) { @Override protected LinkWithOffsets convert(InternalLinkWithOffsets internalLinkWithOffsets) { return internalLinkWithOffsets; } }, new ConvertingIterable<RedirectWithOffsets, LinkWithOffsets>(textConverter.getRedirects()) { @Override protected LinkWithOffsets convert(RedirectWithOffsets redirectWithOffsets) { return redirectWithOffsets; } } ); } @Inject public void setUserRepository(UserRepository userRepository) { this.userRepository = userRepository; } private class ParsePage { private String pageString; private String wikitext; private String pageTitle; private String sourceUrl; private Date revisionTimestamp; public ParsePage(String pageString) { this.pageString = pageString; } public String getWikitext() { return wikitext; } public String getPageTitle() { return pageTitle; } public String getSourceUrl() { return sourceUrl; } public Date getRevisionTimestamp() { return revisionTimestamp; } public ParsePage invoke() throws JDOMException, IOException { SAXBuilder builder = new SAXBuilder(); Document doc = builder.build(new ByteArrayInputStream(pageString.getBytes())); pageTitle = textToString(titleXPath.evaluateFirst(doc)); wikitext = textToString(textXPath.evaluate(doc)); sourceUrl = "http://en.wikipedia.org/wiki/" + pageTitle; String revisionTimestampString = textToString(revisionTimestampXPath.evaluateFirst(doc)); revisionTimestamp = null; try { revisionTimestamp = ISO8601DATEFORMAT.parse(revisionTimestampString); } catch (Exception ex) { LOGGER.error("Could not parse revision timestamp %s", revisionTimestampString, ex); } return this; } private String textToString(List<org.jdom2.Text> texts) { StringBuilder sb = new StringBuilder(); for (org.jdom2.Text t : texts) { sb.append(textToString(t)); } return sb.toString(); } private String textToString(org.jdom2.Text text) { if (text == null) { return ""; } return text.getText(); } public void setWikitext(String wikitext) { this.wikitext = wikitext; } } }