/* * This file is part of the Heritrix web crawler (crawler.archive.org). * * Licensed to the Internet Archive (IA) by one or more individual * contributors. * * The IA licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.archive.util.anvl; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.UnsupportedEncodingException; import java.util.Collection; import java.util.HashMap; import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.logging.Level; import java.util.logging.Logger; import org.archive.io.UTF8Bytes; /** * An ordered {@link List} with 'data' {@link Element} values. * ANVLRecords end with a blank line. * * @see <a * href="http://www.cdlib.org/inside/diglib/ark/anvlspec.pdf">A Name-Value * Language (ANVL)</a> * @author stack */ public class ANVLRecord extends LinkedList<Element> implements UTF8Bytes { private static final Logger logger = Logger.getLogger(ANVLRecord.class.getName()); public static final String MIMETYPE = "application/warc-fields"; public static final ANVLRecord EMPTY_ANVL_RECORD = new ANVLRecord(); /** * Arbitrary upper bound on maximum size of ANVL Record. * Will throw an IOException if exceed this size. */ public static final long MAXIMUM_SIZE = 1024 * 10; /** * An ANVL 'newline'. * @see <a href="http://en.wikipedia.org/wiki/CRLF">http://en.wikipedia.org/wiki/CRLF</a> */ protected static final String CRLF = "\r\n"; protected static final String FOLD_PREFIX = CRLF + ' '; public ANVLRecord() { super(); } public ANVLRecord(Collection<? extends Element> c) { super(c); } /** @deprecated */ public ANVLRecord(int initialCapacity) { super(); } public boolean addLabel(final String l) { return super.add(new Element(new Label(l))); } public boolean addLabelValue(final String l, final String v) { try { return super.add(new Element(new Label(l), new Value(v))); } catch (IllegalArgumentException e) { logger.log(Level.WARNING, "bad label " + l + " or value " + v, e); return false; } } @Override public String toString() { // TODO: What to emit for empty ANVLRecord? StringBuilder sb = new StringBuilder(); for (final Iterator<Element> i = iterator(); i.hasNext();) { sb.append(i.next()); sb.append(CRLF); } // 'ANVL Records end in a blank line'. sb.append(CRLF); return sb.toString(); } public Map<String, String> asMap() { Map<String, String> m = new HashMap<String, String>(size()); for (final Iterator<Element> i = iterator(); i.hasNext();) { Element e = i.next(); m.put(e.getLabel().toString(), e.isValue()? e.getValue().toString(): (String)null); } return m; } @Override public ANVLRecord clone() { return (ANVLRecord) super.clone(); } /** * @return This ANVLRecord as UTF8 bytes. */ public byte [] getUTF8Bytes() throws UnsupportedEncodingException { return toString().getBytes(UTF8); } /** * Parses a single ANVLRecord from passed InputStream. * Read as a single-byte stream until we get to a CRLFCRLF which * signifies End-of-ANVLRecord. Then parse all read as a UTF-8 Stream. * Doing it this way, while requiring a double-scan, it makes it so do not * need to be passed a RepositionableStream or a Stream that supports * marking. Also no danger of over-reading which can happen when we * wrap passed Stream with an InputStreamReader for doing UTF-8 * character conversion (See the ISR class comment). * @param is InputStream * @return An ANVLRecord instance. * @throws IOException */ public static ANVLRecord load(final InputStream is) throws IOException { // It doesn't look like a CRLF sequence is possible in UTF-8 without // it signifying CRLF: The top bits are set in multibyte characters. // Was thinking of recording CRLF as I was running through this first // parse but the offsets would then be incorrect if any multibyte // characters in the intervening gaps between CRLF. boolean isCRLF = false; boolean recordStart = false; ByteArrayOutputStream baos = new ByteArrayOutputStream(1024); boolean done = false; int read = 0; for (int c = -1, previousCharacter; !done;) { if (read++ >= MAXIMUM_SIZE) { throw new IOException("Read " + MAXIMUM_SIZE + " bytes without finding \\r\\n\\r\\n " + "End-Of-ANVLRecord"); } previousCharacter = c; c = is.read(); if (c == -1) { throw new IOException("End-Of-Stream before \\r\\n\\r\\n " + "End-Of-ANVLRecord:\n" + new String(baos.toByteArray(), UTF8)); } if (isLF((char)c) && isCR((char)previousCharacter)) { if (isCRLF) { // If we just had a CRLF, then its two CRLFs and its end of // record. We're done. done = true; } else { isCRLF = true; } } else if (!recordStart && Character.isWhitespace(c)) { // Skip any whitespace at start of ANVLRecord. continue; } else { // Clear isCRLF flag if this character is NOT a '\r'. if (isCRLF && !isCR((char)c)) { isCRLF = false; } // Not whitespace so start record if we haven't already. if (!recordStart) { recordStart = true; } } baos.write(c); } return load(new String(baos.toByteArray(), UTF8)); } /** * Parse passed String for an ANVL Record. * Looked at writing javacc grammer but preprocessing is required to * handle folding: See * https://javacc.dev.java.net/servlets/BrowseList?list=users&by=thread&from=56173. * Looked at Terence Parr's ANTLR. More capable. Can set lookahead count. * A value of 3 would help with folding. But its a pain defining UNICODE * grammers -- needed by ANVL -- and support seems incomplete * anyways: http://www.doc.ic.ac.uk/lab/secondyear/Antlr/lexer.html#unicode. * For now, go with the below hand-rolled parser. * @param s String with an ANVLRecord. * @return ANVLRecord parsed from passed String. * @throws IOException */ public static ANVLRecord load(final String s) throws IOException { ANVLRecord record = new ANVLRecord(); boolean inValue = false, inLabel = false, inComment = false, inNewLine = false; String label = null; StringBuilder sb = new StringBuilder(s.length()); for (int i = 0; i < s.length(); i++) { char c = s.charAt(i); // Assert I can do look-ahead. if ((i + 1) > s.length()) { throw new IOException("Premature End-of-ANVLRecord:\n" + s.substring(i)); } // If at LF of a CRLF, just go around again. Eat up the LF. if (inNewLine && isLF(c)) { continue; } // If we're at a CRLF and we were just on one, exit. Found Record. if (inNewLine && isCR(c) && isLF(s.charAt(i + 1))) { break; } // Check if we're on a fold inside a Value. Skip multiple white // space after CRLF. if (inNewLine && inValue && Character.isWhitespace(c)) { continue; } // Else set flag if we're at a CRLF. inNewLine = isCR(c) && isLF(s.charAt(i + 1)); if (inNewLine) { if (inComment) { inComment = false; } else if (label != null && !inValue) { // Label only 'data element'. record.addLabel(label); label = null; sb.setLength(0); } else if (inValue) { // Assert I can do look-ahead past current CRLF. if ((i + 3) > s.length()) { throw new IOException("Premature End-of-ANVLRecord " + "(2):\n" + s.substring(i)); } if (!isCR(s.charAt(i + 2)) && !isLF(s.charAt(i + 3)) && Character.isWhitespace(s.charAt(i + 2))) { // Its a fold. Let it go around. But add in a CRLF and // space and do it here. We don't let CRLF fall through // to the sb.append on the end of this loop. sb.append(CRLF); sb.append(' '); } else { // Next line is a new SubElement, a new Comment or // Label. record.addLabelValue(label, sb.toString()); sb.setLength(0); label = null; inValue = false; } } else { // We're whitespace between label and value or whitespace // before we've figured whether label or comment. } // Don't let the '\r' or CRLF through. continue; } if (inComment) { continue; } else if (inLabel) { if (c == Label.COLON) { label = sb.toString(); sb.setLength(0); inLabel = false; continue; } } else { if (!inLabel && !inValue && !inComment) { // We have no state. Figure one. if (Character.isWhitespace(c)) { // If no state, and whitespace, skip. Don't record. continue; } else if (label == null && c == '#') { inComment = true; // Don't record comments. continue; } else if (label == null) { inLabel = true; } else { inValue = true; } } } sb.append(c); } return record; } /** * @return Count of ANVLRecord bytes. Be careful, an empty ANVLRecord is * CRLFCRLF so is of size 4. Also, expensive, since it makes String of * the record so it can count bytes. */ public synchronized int getLength() { int length = -1; try { length = getUTF8Bytes().length; } catch (UnsupportedEncodingException e) { throw new RuntimeException(e); } return length; } public static boolean isCROrLF(final char c) { return isCR(c) || isLF(c); } public static boolean isCR(final char c) { return c == ANVLRecord.CRLF.charAt(0); } public static boolean isLF(final char c) { return c == ANVLRecord.CRLF.charAt(1); } }