/** * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.parse; import java.io.*; import java.util.*; import org.apache.hadoop.io.*; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.*; import org.apache.nutch.metadata.Metadata; import org.apache.nutch.util.NutchConfiguration; /** Data extracted from a page's content. * @see Parse#getData() */ public final class ParseData extends VersionedWritable { public static final String DIR_NAME = "parse_data"; private final static byte VERSION = 5; private String title; private Outlink[] outlinks; private Metadata contentMeta; private Metadata parseMeta; private ParseStatus status; private byte version = VERSION; public ParseData() { contentMeta = new Metadata(); parseMeta = new Metadata(); } public ParseData(ParseStatus status, String title, Outlink[] outlinks, Metadata contentMeta) { this(status, title, outlinks, contentMeta, new Metadata()); } public ParseData(ParseStatus status, String title, Outlink[] outlinks, Metadata contentMeta, Metadata parseMeta) { this.status = status; this.title = title; this.outlinks = outlinks; this.contentMeta = contentMeta; this.parseMeta = parseMeta; } // // Accessor methods // /** The status of parsing the page. */ public ParseStatus getStatus() { return status; } /** The title of the page. */ public String getTitle() { return title; } /** The outlinks of the page. */ public Outlink[] getOutlinks() { return outlinks; } /** The original Metadata retrieved from content */ public Metadata getContentMeta() { return contentMeta; } /** * Other content properties. * This is the place to find format-specific properties. * Different parser implementations for different content types will populate * this differently. */ public Metadata getParseMeta() { return parseMeta; } public void setParseMeta(Metadata parseMeta) { this.parseMeta = parseMeta; } /** * Get a metadata single value. * This method first looks for the metadata value in the parse metadata. If no * value is found it the looks for the metadata in the content metadata. * @see #getContentMeta() * @see #getParseMeta() */ public String getMeta(String name) { String value = parseMeta.get(name); if (value == null) { value = contentMeta.get(name); } return value; } // // Writable methods // public byte getVersion() { return version; } public final void readFields(DataInput in) throws IOException { version = in.readByte(); // incompatible change from UTF8 (version < 5) to Text if (version != VERSION) throw new VersionMismatchException(VERSION, version); status = ParseStatus.read(in); title = Text.readString(in); // read title int numOutlinks = in.readInt(); outlinks = new Outlink[numOutlinks]; for (int i = 0; i < numOutlinks; i++) { outlinks[i] = Outlink.read(in); } if (version < 3) { int propertyCount = in.readInt(); // read metadata contentMeta.clear(); for (int i = 0; i < propertyCount; i++) { contentMeta.add(Text.readString(in), Text.readString(in)); } } else { contentMeta.clear(); contentMeta.readFields(in); } if (version > 3) { parseMeta.clear(); parseMeta.readFields(in); } } public final void write(DataOutput out) throws IOException { out.writeByte(VERSION); // write version status.write(out); // write status Text.writeString(out, title); // write title out.writeInt(outlinks.length); // write outlinks for (int i = 0; i < outlinks.length; i++) { outlinks[i].write(out); } contentMeta.write(out); // write content metadata parseMeta.write(out); } public static ParseData read(DataInput in) throws IOException { ParseData parseText = new ParseData(); parseText.readFields(in); return parseText; } // // other methods // public boolean equals(Object o) { if (!(o instanceof ParseData)) return false; ParseData other = (ParseData)o; return this.status.equals(other.status) && this.title.equals(other.title) && Arrays.equals(this.outlinks, other.outlinks) && this.contentMeta.equals(other.contentMeta) && this.parseMeta.equals(other.parseMeta); } public String toString() { StringBuffer buffer = new StringBuffer(); buffer.append("Version: " + version + "\n" ); buffer.append("Status: " + status + "\n" ); buffer.append("Title: " + title + "\n" ); if (outlinks != null) { buffer.append("Outlinks: " + outlinks.length + "\n" ); for (int i = 0; i < outlinks.length; i++) { buffer.append(" outlink: " + outlinks[i] + "\n"); } } buffer.append("Content Metadata: " + contentMeta + "\n" ); buffer.append("Parse Metadata: " + parseMeta + "\n" ); return buffer.toString(); } public static void main(String argv[]) throws Exception { String usage = "ParseData (-local | -dfs <namenode:port>) recno segment"; if (argv.length < 3) { System.out.println("usage:" + usage); return; } Configuration conf = NutchConfiguration.create(); FileSystem fs = FileSystem.parseArgs(argv, 0, conf); try { int recno = Integer.parseInt(argv[0]); String segment = argv[1]; Path file = new Path(segment, DIR_NAME); System.out.println("Reading from file: " + file); ArrayFile.Reader parses = new ArrayFile.Reader(fs, file.toString(), conf); ParseData parseDatum = new ParseData(); parses.get(recno, parseDatum); System.out.println("Retrieved " + recno + " from file " + file); System.out.println(parseDatum); parses.close(); } finally { fs.close(); } } }