/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
import org.apache.hadoop.conf.*;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.*;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.*;
import org.apache.hadoop.fs.Path;
import java.io.*;
import java.util.*;
import java.util.Map.Entry;
/* Parse content in a segment. */
public class ParseSegment extends Configured implements Tool,
Mapper<WritableComparable, Content, Text, ParseImpl>,
Reducer<Text, Writable, Text, Writable> {
public static final Log LOG = LogFactory.getLog(Parser.class);
private ScoringFilters scfilters;
public ParseSegment() {
this(null);
}
public ParseSegment(Configuration conf) {
super(conf);
}
public void configure(JobConf job) {
setConf(job);
this.scfilters = new ScoringFilters(job);
}
public void close() {}
private Text newKey = new Text();
public void map(WritableComparable key, Content content,
OutputCollector<Text, ParseImpl> output, Reporter reporter)
throws IOException {
// convert on the fly from old UTF8 keys
if (key instanceof UTF8) {
newKey.set(key.toString());
key = newKey;
}
int status =
Integer.parseInt(content.getMetadata().get(Nutch.FETCH_STATUS_KEY));
if (status != CrawlDatum.STATUS_FETCH_SUCCESS) {
// content not fetched successfully, skip document
LOG.debug("Skipping " + key + " as content is not fetched successfully");
return;
}
ParseResult parseResult = null;
try {
parseResult = new ParseUtil(getConf()).parse(content);
} catch (Exception e) {
LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
return;
}
for (Entry<Text, Parse> entry : parseResult) {
Text url = entry.getKey();
Parse parse = entry.getValue();
ParseStatus parseStatus = parse.getData().getStatus();
if (!parseStatus.isSuccess()) {
LOG.warn("Error parsing: " + key + ": " + parseStatus);
parse = parseStatus.getEmptyParse(getConf());
}
// pass segment name to parse data
parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY,
getConf().get(Nutch.SEGMENT_NAME_KEY));
// compute the new signature
byte[] signature =
SignatureFactory.getSignature(getConf()).calculate(content, parse);
parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY,
StringUtil.toHexString(signature));
try {
scfilters.passScoreAfterParsing(url, content, parse);
} catch (ScoringFilterException e) {
if (LOG.isWarnEnabled()) {
e.printStackTrace(LogUtil.getWarnStream(LOG));
LOG.warn("Error passing score: "+ url +": "+e.getMessage());
}
}
output.collect(url, new ParseImpl(new ParseText(parse.getText()),
parse.getData(), parse.isCanonical()));
}
}
public void reduce(Text key, Iterator<Writable> values,
OutputCollector<Text, Writable> output, Reporter reporter)
throws IOException {
output.collect(key, (Writable)values.next()); // collect first value
}
public void parse(Path segment) throws IOException {
if (LOG.isInfoEnabled()) {
LOG.info("Parse: starting");
LOG.info("Parse: segment: " + segment);
}
JobConf job = new NutchJob(getConf());
job.setJobName("parse " + segment);
FileInputFormat.addInputPath(job, new Path(segment, Content.DIR_NAME));
job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());
job.setInputFormat(SequenceFileInputFormat.class);
job.setMapperClass(ParseSegment.class);
job.setReducerClass(ParseSegment.class);
FileOutputFormat.setOutputPath(job, segment);
job.setOutputFormat(ParseOutputFormat.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(ParseImpl.class);
JobClient.runJob(job);
if (LOG.isInfoEnabled()) { LOG.info("Parse: done"); }
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(NutchConfiguration.create(), new ParseSegment(), args);
System.exit(res);
}
public int run(String[] args) throws Exception {
Path segment;
String usage = "Usage: ParseSegment segment";
if (args.length == 0) {
System.err.println(usage);
System.exit(-1);
}
segment = new Path(args[0]);
parse(segment);
return 0;
}
}