/* * Cloud9: A MapReduce Library for Hadoop * * Licensed under the Apache License, Version 2.0 (the "License"); you * may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or * implied. See the License for the specific language governing * permissions and limitations under the License. */ package edu.umd.cloud9.collection.spinn3r; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.text.DateFormat; import java.text.ParseException; import java.text.SimpleDateFormat; import java.util.Date; import org.apache.hadoop.io.WritableUtils; import edu.umd.cloud9.collection.Indexable; public class Spinn3rItem extends Indexable { public static final String XML_START_TAG = "<item>"; public static final String XML_END_TAG = "</item>"; private String mItem; private String mTitle; private String mGuid; private String mLanguage; private String mDescription; private Date mPubDate; public Spinn3rItem() { } public void write(DataOutput out) throws IOException { byte[] bytes = mItem.getBytes(); WritableUtils.writeVInt(out, bytes.length); out.write(bytes, 0, bytes.length); } public void readFields(DataInput in) throws IOException { int length = WritableUtils.readVInt(in); byte[] bytes = new byte[length]; in.readFully(bytes, 0, length); Spinn3rItem.readItem(this, new String(bytes)); } public String getDocid() { return mGuid; } public void setDocid(String docid) { mGuid = docid; } public String getContent() { return getTitle() + "\n" + getDescription(); } public String getRawXML() { return mItem; } public String getTitle() { return mTitle; } public String getDescription() { return mDescription; } public String getGuid() { return mGuid; } public String getLanguage() { return mLanguage; } public Date getPubDate() { if (mPubDate == null) { int start = mItem.indexOf("<pubDate>"); int end = mItem.indexOf("</pubDate>", start); String s = mItem.substring(start + 9, end); try { DateFormat format = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z"); mPubDate = format.parse(s); } catch (ParseException e) { e.printStackTrace(); } } return mPubDate; } public static void readItem(Spinn3rItem item, String s) { item.mItem = s; // parse out title int start = s.indexOf("<title>"); int end = s.indexOf("</title>", start); item.mTitle = s.substring(start + 7, end); // parse out guid start = s.indexOf("<guid>"); end = s.indexOf("</guid>", start); item.mGuid = s.substring(start + 6, end); // parse out actual text of article start = s.indexOf("<description>"); end = s.indexOf("</description>", start); item.mDescription = s.substring(start + 13, end); start = s.indexOf("<dc:lang>"); end = s.indexOf("</dc:lang>", start); item.mLanguage = s.substring(start + 9, end); item.mPubDate = null; } }