package edu.umd.cloud9.collection.pmc; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import org.apache.hadoop.io.WritableUtils; import edu.umd.cloud9.collection.Indexable; public class PmcArticle extends Indexable { public static final String XML_START_TAG = "<article "; public static final String XML_END_TAG = "</article>"; private String mPmcid; private String mDOI; private String mArticleText; public PmcArticle() { } public void write(DataOutput out) throws IOException { byte[] bytes = mArticleText.getBytes(); WritableUtils.writeVInt(out, bytes.length); out.write(bytes, 0, bytes.length); } public void readFields(DataInput in) throws IOException { int length = WritableUtils.readVInt(in); byte[] bytes = new byte[length]; in.readFully(bytes, 0, length); PmcArticle.readArticle(this, new String(bytes)); } public String getDocid() { return getPmcid(); } public String getContent() { return ""; } public String getPmcid() { if (mPmcid == null) { int start = mArticleText.indexOf("<article-id pub-id-type=\"pmc\">"); if (start == -1) { throw new RuntimeException(getRawXML()); } else { int end = mArticleText.indexOf("</article-id>", start); mPmcid = mArticleText.substring(start + 30, end); } } return mPmcid; } public String getDOI() { if (mDOI == null) { int start = mArticleText.indexOf("<article-id pub-id-type=\"doi\">"); if (start == -1) { mDOI = ""; } else { int end = mArticleText.indexOf("</article-id>", start); mDOI = mArticleText.substring(start + 30, end); } } return mDOI; } public String getReferencesXML() { int start = mArticleText.indexOf("<ref-list"); if (start == -1) return ""; int end = mArticleText.indexOf("</ref-list>", start); return mArticleText.substring(start, end); } public String getRawXML() { return mArticleText; } public static void readArticle(PmcArticle article, String s) { if (s == null) { throw new RuntimeException("Error, can't read null string!"); } article.mArticleText = s; article.mPmcid = null; article.mDOI = null; } }