package org.archive.hadoop.jobs; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.util.logging.Logger; import java.util.regex.Pattern; import org.apache.commons.httpclient.URIException; import org.archive.url.DefaultIAURLCanonicalizer; import org.archive.url.HandyURL; import org.archive.url.NonMassagingIAURLCanonicalizer; import org.archive.url.URLCanonicalizer; import org.archive.url.URLParser; import org.archive.url.URLRegexTransformer; public class CDXTransformer { private final static Logger LOG = Logger.getLogger(CDXTransformer.class.getCanonicalName()); private PrintWriter out; private char delim = ' '; private URLCanonicalizer can = new DefaultIAURLCanonicalizer(); public CDXTransformer(PrintWriter out) { this.out = out; } private final static Pattern SPACE_PATTERN = Pattern.compile(" "); public static void main(String args[]) throws IOException { String line; BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(System.out)); PrintWriter pw = new PrintWriter(bw); CDXTransformer t = new CDXTransformer(pw); BufferedReader br = new BufferedReader(new InputStreamReader(System.in)); if(args.length > 0) { if(args[0].equals("--no-massage")) { t.setCan(new NonMassagingIAURLCanonicalizer()); } } while(true) { line = br.readLine(); if(line == null) { break; } t.output(line); } pw.flush(); } public void output(String cdxIn) { // assume fields: N b a m s k r V g // url-key (old) // timestamp // orig-url // mime // http-code // digest // redirect // (OPTIONAL robot-flag) // start-offset // filename String parts[] = SPACE_PATTERN.split(cdxIn, 11); int offsetIdx = -1; if(parts.length == 9) { offsetIdx = 7; } else if(parts.length == 10) { if(parts[7].contains("A")) { // NO-ARCHIVE HTML meta instruction return; } offsetIdx = 8; } else { LOG.warning("Bad format line:\t" + cdxIn); return; } // String urlKey = parts[0]; String captureTS = parts[1]; String originalUrl = parts[2]; String mimeType = parts[3]; String httpCode = parts[4]; String digest = parts[5]; String redirectUrl = parts[6]; long compressedOffset = -1; try { compressedOffset = Long.parseLong(parts[offsetIdx]); } catch (NumberFormatException e) { LOG.warning("Bad compressed Offset field("+parts[offsetIdx]+") in (" + cdxIn +")"); return; } String filename = parts[offsetIdx+1]; HandyURL h; try { h = URLParser.parse(originalUrl); } catch (URIException e) { LOG.warning(String.format("Bad original URL(%s) error(%s)", originalUrl,e.getMessage())); return; } can.canonicalize(h); // StringBuilder sb = new StringBuilder(cdxIn.length() + 5); // sb.append(h.getPublicSuffix()).append(delim); // sb.append(h.getPathQuery()).append(delim); // sb.append(captureTS).append(delim); // // sb.append(nullToDash(h.getPublicPrefix())).append(delim); // sb.append(nullToDash(h.getScheme())).append(delim); // sb.append(originalUrl).append(delim); // sb.append(nullToDash(mimeType)).append(delim); // sb.append(nullToDash(httpCode)).append(delim); // sb.append(nullToDash(digest)).append(delim); // sb.append(nullToDash(redirectUrl)).append(delim); // sb.append(compressedOffset).append(delim); // sb.append(nullToDash(filename)); // StringBuilder sb = new StringBuilder(cdxIn.length() + 5); out.print("("); out.print(URLRegexTransformer.hostToSURT(h.getPublicSuffix())); out.print(delim); out.print(h.getPathQuery()); out.print(delim); out.print(captureTS); out.print(delim); out.print(nullToDash(h.getPublicPrefix())); out.print(delim); out.print(nullToDash(h.getScheme())); out.print(delim); out.print(originalUrl); out.print(delim); out.print(nullToDash(mimeType)); out.print(delim); out.print(nullToDash(httpCode)); out.print(delim); out.print(nullToDash(digest)); out.print(delim); out.print(nullToDash(redirectUrl)); out.print(delim); out.print(compressedOffset); out.print(delim); out.print(nullToDash(filename)); ; out.println(); } private static final String nullToDash(String in) { return ((in == null) || (in.length() == 0)) ? "-" : in; } /** * @return the can */ public URLCanonicalizer getCan() { return can; } /** * @param can the can to set */ public void setCan(URLCanonicalizer can) { this.can = can; } }