/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.nutch.tools.arc;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Map.Entry;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.fetcher.FetcherOutputFormat;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;

/**
 * <p>The <code>ArcSegmentCreator</code> is a replacement for the fetcher that
 * takes arc files as input and produces a nutch segment as output.</p>
 *
 * <p>Arc files are concatenations of gzip compressed records, produced by
 * both the internet archive project and the grub distributed crawler
 * project.</p>
 */
public class ArcSegmentCreator extends Configured implements Tool,
    Mapper<Text, BytesWritable, Text, NutchWritable> {

  public static final Log LOG = LogFactory.getLog(ArcSegmentCreator.class);
  public static final String URL_VERSION = "arc.url.version";

  private JobConf jobConf;
  private URLFilters urlFilters;
  private ScoringFilters scfilters;
  private ParseUtil parseUtil;
  private URLNormalizers normalizers;
  private int interval;

  private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

  public ArcSegmentCreator() {
  }

  /**
   * <p>Constructor that sets the job configuration.</p>
   *
   * @param conf The Hadoop configuration to use.
   */
  public ArcSegmentCreator(Configuration conf) {
    setConf(conf);
  }
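
  // A minimal programmatic usage sketch (the paths below are assumptions,
  // for illustration only):
  //
  //   ArcSegmentCreator creator = new ArcSegmentCreator(NutchConfiguration.create());
  //   creator.createSegments(new Path("/data/arcs"), new Path("/crawl/segments"));
  //
  // createSegments (defined below) builds and runs the MapReduce job; main
  // delegates to run through ToolRunner in the usual Hadoop Tool fashion.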
  /**
   * Generates a name for a segment from the current timestamp.
   *
   * @return The generated segment name.
   */
  public static synchronized String generateSegmentName() {
    try {
      // sleep for a second so that two calls never land in the same second
      // and produce colliding segment names
      Thread.sleep(1000);
    }
    catch (Throwable t) {
    }
    return sdf.format(new Date(System.currentTimeMillis()));
  }

  /**
   * <p>Configures the job.  Sets the url filters, scoring filters, url
   * normalizers and other relevant data.</p>
   *
   * @param job The job configuration.
   */
  public void configure(JobConf job) {
    // set the url filters, scoring filters, the parse util and the url
    // normalizers
    this.jobConf = job;
    this.urlFilters = new URLFilters(jobConf);
    this.scfilters = new ScoringFilters(jobConf);
    this.parseUtil = new ParseUtil(jobConf);
    this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_FETCHER);
    interval = jobConf.getInt("db.fetch.interval.default", 2592000);
  }

  public void close() {
  }

  /**
   * <p>Parses the raw content of a single record to create output.  This
   * method is almost the same as the
   * {@link org.apache.nutch.fetcher.Fetcher#output} method in terms of
   * processing and output.</p>
   *
   * @param output The job output collector.
   * @param segmentName The name of the segment to create.
   * @param key The url of the record.
   * @param datum The CrawlDatum of the record.
   * @param content The raw content of the record.
   * @param pstatus The protocol status.
   * @param status The fetch status.
   *
   * @return The result of the parse in a ParseStatus object.
   */
  private ParseStatus output(OutputCollector<Text, NutchWritable> output,
    String segmentName, Text key, CrawlDatum datum, Content content,
    ProtocolStatus pstatus, int status) {

    // set the fetch status and the fetch time
    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null) datum.getMetaData().put(
      Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();

      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);

      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      }
      catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          e.printStackTrace(LogUtil.getWarnStream(LOG));
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }

      try {
        // parse the content
        parseResult = this.parseUtil.parse(content);
      }
      catch (Exception e) {
        LOG.warn("Error parsing: " + key + ": "
          + StringUtils.stringifyException(e));
      }

      // set the content signature
      if (parseResult == null) {
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(
          content, new ParseStatus().getEmptyParse(getConf()));
        datum.setSignature(signature);
      }

      try {
        output.collect(key, new NutchWritable(datum));
        output.collect(key, new NutchWritable(content));

        if (parseResult != null) {
          for (Entry<Text, Parse> entry : parseResult) {
            Text url = entry.getKey();
            Parse parse = entry.getValue();
            ParseStatus parseStatus = parse.getData().getStatus();

            if (!parseStatus.isSuccess()) {
              LOG.warn("Error parsing: " + key + ": " + parseStatus);
              parse = parseStatus.getEmptyParse(getConf());
            }

            // Calculate page signature.
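            // The concrete signature implementation is resolved by
            // SignatureFactory from the job configuration (the
            // db.signature.class property; an MD5 based signature by default).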
            byte[] signature = SignatureFactory.getSignature(getConf())
              .calculate(content, parse);

            // Ensure segment name and score are in parseData metadata
            parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
              segmentName);
            parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
              StringUtil.toHexString(signature));
            // Pass fetch time to content meta
            parse.getData().getContentMeta().set(Nutch.FETCH_TIME_KEY,
              Long.toString(datum.getFetchTime()));
            if (url.equals(key)) datum.setSignature(signature);

            try {
              scfilters.passScoreAfterParsing(url, content, parse);
            }
            catch (Exception e) {
              if (LOG.isWarnEnabled()) {
                e.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
              }
            }

            output.collect(url, new NutchWritable(new ParseImpl(new ParseText(
              parse.getText()), parse.getData(), parse.isCanonical())));
          }
        }
      }
      catch (IOException e) {
        if (LOG.isFatalEnabled()) {
          LOG.fatal("ArcSegmentCreator caught:"
            + StringUtils.stringifyException(e));
        }
      }

      // return parse status if it exists
      if (parseResult != null && !parseResult.isEmpty()) {
        Parse p = parseResult.get(content.getUrl());
        if (p != null) {
          return p.getData().getStatus();
        }
      }
    }

    return null;
  }

  /**
   * <p>Logs any error that occurs during conversion.</p>
   *
   * @param url The url we are parsing.
   * @param t The error that occurred.
   */
  private void logError(Text url, Throwable t) {
    if (LOG.isInfoEnabled()) {
      LOG.info("Conversion of " + url + " failed with: "
        + StringUtils.stringifyException(t));
    }
  }

  /**
   * <p>Runs the Map job to translate an arc record into output for Nutch
   * segments.</p>
   *
   * @param key The arc record header.
   * @param bytes The arc record raw content bytes.
   * @param output The output collector.
   * @param reporter The progress reporter.
   */
  public void map(Text key, BytesWritable bytes,
    OutputCollector<Text, NutchWritable> output, Reporter reporter)
    throws IOException {

    // the key is the arc record header line, a whitespace separated list of
    // fields from which we use the url, the version and the content type
    String[] headers = key.toString().split("\\s+");
    String urlStr = headers[0];
    String version = headers[2];
    String contentType = headers[3];

    // arc files start with a file description.  for now we ignore this as it
    // is not a content record
    if (urlStr.startsWith("filedesc://")) {
      LOG.info("Ignoring file header: " + urlStr);
      return;
    }
    LOG.info("Processing: " + urlStr);

    // get the raw bytes from the arc file, create a new crawldatum
    Text url = new Text();
    CrawlDatum datum = new CrawlDatum(CrawlDatum.STATUS_DB_FETCHED, interval,
      1.0f);
    String segmentName = getConf().get(Nutch.SEGMENT_NAME_KEY);

    // normalize and filter the urls
    try {
      urlStr = normalizers.normalize(urlStr, URLNormalizers.SCOPE_FETCHER);
      urlStr = urlFilters.filter(urlStr); // filter the url
    }
    catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Skipping " + urlStr + ":" + e);
      }
      urlStr = null;
    }

    // if still a good url then process
    if (urlStr != null) {
      url.set(urlStr);
      try {

        // set the protocol status to success and the crawl status to success
        // create the content from the normalized url and the raw bytes from
        // the arc file.  TODO: currently this doesn't handle the text of
        // error pages (i.e. 404, etc.), we assume we won't get those.
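        // Note that urlStr is passed twice to the Content constructor below:
        // once as the page url and once as the base url used to resolve
        // relative links when the content is parsed.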
        ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
        Content content = new Content(urlStr, urlStr, bytes.get(),
          contentType, new Metadata(), getConf());

        // set the url version into the metadata
        content.getMetadata().set(URL_VERSION, version);
        ParseStatus pstatus = null;
        pstatus = output(output, segmentName, url, datum, content, status,
          CrawlDatum.STATUS_FETCH_SUCCESS);
        reporter.progress();
      }
      catch (Throwable t) { // unexpected exception
        logError(url, t);
        output(output, segmentName, url, datum, null, null,
          CrawlDatum.STATUS_FETCH_RETRY);
      }
    }
  }

  /**
   * <p>Creates and runs the job that converts arc files into segments.</p>
   *
   * @param arcFiles The path to the directory holding the arc files.
   * @param segmentsOutDir The output directory for writing the segments.
   *
   * @throws IOException If an IO error occurs while running the job.
   */
  public void createSegments(Path arcFiles, Path segmentsOutDir)
    throws IOException {

    if (LOG.isInfoEnabled()) {
      LOG.info("ArcSegmentCreator: starting");
      LOG.info("ArcSegmentCreator: arc files dir: " + arcFiles);
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("ArcSegmentCreator " + arcFiles);
    String segName = generateSegmentName();
    job.set(Nutch.SEGMENT_NAME_KEY, segName);
    FileInputFormat.addInputPath(job, arcFiles);
    job.setInputFormat(ArcInputFormat.class);
    job.setMapperClass(ArcSegmentCreator.class);
    FileOutputFormat.setOutputPath(job, new Path(segmentsOutDir, segName));
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);
    if (LOG.isInfoEnabled()) {
      LOG.info("ArcSegmentCreator: done");
    }
  }

  public static void main(String[] args) throws Exception {
    int res = ToolRunner.run(NutchConfiguration.create(),
      new ArcSegmentCreator(), args);
    System.exit(res);
  }

  public int run(String[] args) throws Exception {

    String usage = "Usage: ArcSegmentCreator <arcFiles> <segmentsOutDir>";

    if (args.length < 2) {
      System.err.println(usage);
      return -1;
    }

    // set the arc files directory and the segments output directory
    Path arcFiles = new Path(args[0]);
    Path segmentsOutDir = new Path(args[1]);

    try {
      // create the segments from the arc files
      createSegments(arcFiles, segmentsOutDir);
      return 0;
    }
    catch (Exception e) {
      LOG.fatal("ArcSegmentCreator: " + StringUtils.stringifyException(e));
      return -1;
    }
  }
}
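
// A command-line invocation sketch (the paths are hypothetical); the nutch
// shell script can run an arbitrary class by its fully qualified name:
//
//   bin/nutch org.apache.nutch.tools.arc.ArcSegmentCreator /data/arcs /crawl/segments
//
// This creates a single timestamped segment directory under /crawl/segments.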