package org.apache.nutchbase.parse;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.io.BatchUpdate;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.hbase.io.RowResult;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.URLUtil;
import org.apache.nutchbase.crawl.CrawlDatumHbase;
import org.apache.nutchbase.crawl.SignatureFactoryHbase;
import org.apache.nutchbase.crawl.SignatureHbase;
import org.apache.nutchbase.fetcher.FetcherHbase;
import org.apache.nutchbase.util.hbase.RowPart;
import org.apache.nutchbase.util.hbase.TableColumns;
import org.apache.nutchbase.util.hbase.TableMapReduce;
import org.apache.nutchbase.util.hbase.TableUtil;

/**
 * MapReduce job that parses the content of freshly fetched rows in the web
 * table, extracts text, title, signature and outlinks, and writes the
 * results back to HBase.
 */
public class ParseTable
    extends TableMapReduce<ImmutableBytesWritable, RowPart> implements Tool {

  public static final Log LOG = LogFactory.getLog(ParseTable.class);

  public static final String TMP_UPDATE_MARK = "__tmp_update_mark__";

  /** Fixed set of columns the job always reads from the web table. */
  private static final Set<String> COLUMNS = new HashSet<String>();

  static {
    COLUMNS.add(TableColumns.STATUS_STR);
    COLUMNS.add(TableColumns.CONTENT_STR);
    COLUMNS.add(TableColumns.CONTENT_TYPE_STR);
    COLUMNS.add(TableColumns.SIGNATURE_STR);
    COLUMNS.add(TableColumns.METADATA_STR + FetcherHbase.TMP_PARSE_MARK);
  }

  private ParseUtilHbase parseUtil;
  private SignatureHbase sig;
  private URLFilters filters;
  private URLNormalizers normalizers;
  private int maxOutlinks;
  private boolean ignoreExternalLinks;

  @Override
  public void configure(JobConf job) {
    super.configure(job);
    parseUtil = new ParseUtilHbase(job);
    sig = SignatureFactoryHbase.getSignature(job);
    filters = new URLFilters(job);
    normalizers = new URLNormalizers(job, URLNormalizers.SCOPE_OUTLINK);
    final int maxOutlinksPerPage = job.getInt("db.max.outlinks.per.page", 100);
    maxOutlinks = (maxOutlinksPerPage < 0) ? Integer.MAX_VALUE
                                           : maxOutlinksPerPage;
    ignoreExternalLinks = job.getBoolean("db.ignore.external.links", false);
  }
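
  /**
   * Parses a single row of the web table. Rows that do not carry the
   * fetcher's temporary parse mark, or whose status is not FETCHED, are
   * skipped. For a successful parse the text, title, signature and filtered
   * outlinks are stored on the row; for a redirect a new row is emitted for
   * the redirect target and marked with
   * {@link FetcherHbase#REDIRECT_DISCOVERED}.
   */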
  @Override
  public void map(ImmutableBytesWritable key, RowResult rowResult,
      OutputCollector<ImmutableBytesWritable, RowPart> output,
      Reporter reporter) throws IOException {
    final RowPart row = new RowPart(rowResult);
    final String url = TableUtil.unreverseUrl(Bytes.toString(key.get()));

    if (!row.hasMeta(FetcherHbase.TMP_PARSE_MARK)) {
      return;
    }

    final byte status = row.getStatus();
    if (status != CrawlDatumHbase.STATUS_FETCHED) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Skipping " + url + " as status: "
            + CrawlDatumHbase.getName(status));
      }
      return;
    }

    ParseHbase parse;
    try {
      parse = parseUtil.parse(url, row);
    } catch (final Exception e) {
      LOG.warn("Error parsing: " + key + ": "
          + StringUtils.stringifyException(e));
      return;
    }

    final byte[] signature = sig.calculate(row, parse);
    final ParseStatus pstatus = parse.getParseStatus();
    row.setParseStatus(pstatus);

    if (pstatus.isSuccess()) {
      if (pstatus.getMinorCode() == ParseStatus.SUCCESS_REDIRECT) {
        String newUrl = pstatus.getMessage();
        final int refreshTime = Integer.parseInt(pstatus.getArgs()[1]);
        newUrl = normalizers.normalize(newUrl, URLNormalizers.SCOPE_FETCHER);
        try {
          newUrl = filters.filter(newUrl);
          // Only record the redirect if the target survived filtering and
          // actually differs from the current URL.
          if (newUrl != null && !newUrl.equals(url)) {
            final String reprUrl = URLUtil.chooseRepr(url, newUrl,
                refreshTime < Fetcher.PERM_REFRESH_TIME);
            final String reversedUrl = TableUtil.reverseUrl(reprUrl);
            final ImmutableBytesWritable newKey =
                new ImmutableBytesWritable(Bytes.toBytes(reversedUrl));
            final RowPart newRow = new RowPart(newKey.get());
            if (!reprUrl.equals(url)) {
              newRow.setReprUrl(reprUrl);
            }
            newRow.putMeta(FetcherHbase.REDIRECT_DISCOVERED,
                TableUtil.YES_VAL);
            output.collect(newKey, newRow);
          }
        } catch (final Exception e) {
          // Malformed or filtered-out redirect target: skip it.
        }
      } else {
        row.setText(parse.getText());
        row.setTitle(parse.getTitle());
        final byte[] prevSig = row.getSignature();
        if (prevSig != null) {
          row.setPrevSignature(prevSig);
        }
        row.setSignature(signature);
        row.deleteAllOutlinks();

        final Outlink[] outlinks = parse.getOutlinks();
        int count = 0;
        String fromHost;
        if (ignoreExternalLinks) {
          try {
            fromHost = new URL(url).getHost().toLowerCase();
          } catch (final MalformedURLException e) {
            fromHost = null;
          }
        } else {
          fromHost = null;
        }
        for (int i = 0; count < maxOutlinks && i < outlinks.length; i++) {
          String toUrl = outlinks[i].getToUrl();
          toUrl = normalizers.normalize(toUrl, URLNormalizers.SCOPE_OUTLINK);
          try {
            toUrl = filters.filter(toUrl);
          } catch (final URLFilterException e) {
            continue;
          }
          if (toUrl == null) {
            continue;
          }
          if (ignoreExternalLinks) {
            String toHost;
            try {
              toHost = new URL(toUrl).getHost().toLowerCase();
            } catch (final MalformedURLException e) {
              toHost = null;
            }
            if (toHost == null || !toHost.equals(fromHost)) {
              continue; // skip external links
            }
          }
          row.addOutlink(new Outlink(toUrl, outlinks[i].getAnchor()));
          count++; // enforce the db.max.outlinks.per.page limit
        }
        row.putMeta(TMP_UPDATE_MARK, TableUtil.YES_VAL);
      }
    }
    output.collect(key, row);
  }

  @Override
  public void reduce(ImmutableBytesWritable key, Iterator<RowPart> values,
      OutputCollector<ImmutableBytesWritable, BatchUpdate> output,
      Reporter reporter) throws IOException {
    // Row keys are unique, so there is exactly one value per key.
    final RowPart row = values.next();
    row.deleteMeta(FetcherHbase.TMP_PARSE_MARK);
    output.collect(key, row.makeBatchUpdate());
  }

  public void parse(String table) throws IOException {
    LOG.info("ParseTable: starting");
    LOG.info("ParseTable: table: " + table);
    final JobConf job = new NutchJob(getConf());
    job.setJobName("parse-hbase " + table);
    TableMapReduce.initJob(table, getColumns(job), ParseTable.class,
        ImmutableBytesWritable.class, RowPart.class, job);
    JobClient.runJob(job);
    LOG.info("ParseTable: done");
  }
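
  /**
   * Collects the full set of input columns for the job: the fixed columns
   * above plus whatever columns the configured parser, signature and HTML
   * parse filter plugins declare that they need.
   */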
  private String getColumns(JobConf job) {
    final Set<String> columnSet = new HashSet<String>(COLUMNS);
    final ParserFactoryHbase parserFactory = new ParserFactoryHbase(job);
    columnSet.addAll(parserFactory.getColumnSet());
    columnSet.addAll(SignatureFactoryHbase.getSignature(job).getColumnSet());
    final HtmlParseFiltersHbase filters = new HtmlParseFiltersHbase(job);
    columnSet.addAll(filters.getColumnSet());
    return TableUtil.getColumns(columnSet);
  }

  public int run(String[] args) throws Exception {
    final String usage = "Usage: ParseTable <webtable>";
    if (args.length == 0) {
      System.err.println(usage);
      System.exit(-1);
    }
    parse(args[0]);
    return 0;
  }

  /**
   * Tool entry point. Expects the name of the web table as the single
   * argument, e.g. (assuming the standard Nutch launcher script):
   *   bin/nutch org.apache.nutchbase.parse.ParseTable webtable
   */
  public static void main(String[] args) throws Exception {
    final int res = ToolRunner.run(NutchConfiguration.create(),
        new ParseTable(), args);
    System.exit(res);
  }
}