/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.mahout.text; import java.io.IOException; import java.util.HashSet; import java.util.Locale; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringEscapeUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.DefaultStringifier; import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.util.GenericsUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Maps over Wikipedia xml format and output all document having the category listed in the input category * file * */ public class WikipediaMapper extends Mapper<LongWritable, Text, Text, Text> { private static final Logger log = LoggerFactory.getLogger(WikipediaMapper.class); private static final Pattern SPACE_NON_ALPHA_PATTERN = Pattern.compile("[\\s]"); private static final String START_DOC = "<text xml:space=\"preserve\">"; private static final String END_DOC = "</text>"; private static final Pattern TITLE = Pattern.compile("<title>(.*)<\\/title>"); private static final String REDIRECT = "<redirect />"; private Set<String> inputCategories; private boolean exactMatchOnly; private boolean all; @Override protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String content = value.toString(); if (content.contains(REDIRECT)) { return; } String document; String title; try { document = getDocument(content); title = getTitle(content); } catch (RuntimeException e) { // TODO: reporter.getCounter("Wikipedia", "Parse errors").increment(1); return; } if (!all) { String catMatch = findMatchingCategory(document); if ("Unknown".equals(catMatch)) { return; } } document = StringEscapeUtils.unescapeHtml(document); context.write(new Text(SPACE_NON_ALPHA_PATTERN.matcher(title).replaceAll("_")), new Text(document)); } @Override protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Configuration conf = context.getConfiguration(); if (inputCategories == null) { Set<String> newCategories = new HashSet<String>(); DefaultStringifier<Set<String>> setStringifier = new DefaultStringifier<Set<String>>(conf, GenericsUtil.getClass(newCategories)); String categoriesStr = conf.get("wikipedia.categories", setStringifier.toString(newCategories)); inputCategories = setStringifier.fromString(categoriesStr); } exactMatchOnly = conf.getBoolean("exact.match.only", false); all = conf.getBoolean("all.files", true); log.info("Configure: Input Categories size: {} All: {} Exact Match: {}", new Object[] {inputCategories.size(), all, exactMatchOnly}); } private static String getDocument(String xml) { int start = xml.indexOf(START_DOC) + START_DOC.length(); int end = xml.indexOf(END_DOC, start); return xml.substring(start, end); } private static String getTitle(CharSequence xml) { Matcher m = TITLE.matcher(xml); return m.find() ? m.group(1) : ""; } private String findMatchingCategory(String document) { int startIndex = 0; int categoryIndex; while ((categoryIndex = document.indexOf("[[Category:", startIndex)) != -1) { categoryIndex += 11; int endIndex = document.indexOf("]]", categoryIndex); if (endIndex >= document.length() || endIndex < 0) { break; } String category = document.substring(categoryIndex, endIndex).toLowerCase(Locale.ENGLISH).trim(); if (exactMatchOnly && inputCategories.contains(category)) { return category; } if (!exactMatchOnly) { for (String inputCategory : inputCategories) { if (category.contains(inputCategory)) { // we have an inexact match return inputCategory; } } } startIndex = endIndex; } return "Unknown"; } }