/*
* Copyright (c) 2008 Mozilla Foundation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
package nu.validator.dmozdl;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.UnsupportedEncodingException;
import java.util.Properties;
import java.util.zip.GZIPOutputStream;
import nu.validator.xml.PrudentHttpEntityResolver;
import org.apache.log4j.PropertyConfigurator;
import org.mortbay.util.IO;
import org.xml.sax.InputSource;
/**
 * Worker that fetches the documents named in a shared work list and stores
 * each one gzip-compressed under a two-level directory tree.
 *
 * <p>Each input line has the form <code>key&lt;TAB&gt;url</code>. The key
 * (called <code>md5</code> in the code) must be at least four characters
 * long, because its first two and next two characters name the directory
 * levels: <code>rootDir/k0k1/k2k3/key.gz</code>. For every document stored
 * successfully one line <code>key&lt;TAB&gt;url&lt;TAB&gt;charset</code> is
 * written to the output index.
 *
 * <p>Several workers are expected to share one reader and one writer (see
 * {@link #main(String[])}); this class adds no locking of its own around
 * them and relies on the synchronization built into
 * <code>BufferedReader</code> and <code>PrintWriter</code>.
 */
public class Downloader implements Runnable {

    /**
     * First constructor argument of every resolver created here.
     * NOTE(review): presumably the maximum document size in bytes (1 MiB);
     * confirm against PrudentHttpEntityResolver.
     */
    private static final int RESOLVER_SIZE_ARGUMENT = 1024 * 1024;

    /** Number of worker threads started by {@link #main(String[])}. */
    private static final int WORKER_COUNT = 8;

    /** Shortest usable key: two 2-character directory names are cut from it. */
    private static final int MIN_KEY_LENGTH = 4;

    private final BufferedReader in;

    private final PrintWriter out;

    private final File rootDir;

    /**
     * @param in work list, one <code>key&lt;TAB&gt;url</code> entry per line
     * @param out index that receives one line per stored document
     * @param rootDir directory under which the downloaded files are placed
     */
    public Downloader(BufferedReader in, PrintWriter out, File rootDir) {
        this.in = in;
        this.out = out;
        this.rootDir = rootDir;
    }

    /**
     * Consumes lines from the work list until it is exhausted or can no
     * longer be read. A failure while handling a single entry only skips
     * that entry.
     */
    public void run() {
        for (;;) {
            String inLine;
            try {
                inLine = in.readLine();
            } catch (IOException e) {
                // The work list itself is unreadable (e.g. closed). Retrying
                // would busy-loop forever, so this worker stops instead.
                return;
            }
            if (inLine == null) {
                return;
            }
            try {
                download(inLine);
            } catch (Exception ignored) {
                // Deliberate best effort: an entry that cannot be parsed,
                // fetched or stored is skipped and never reaches the index.
            }
        }
    }

    /**
     * Handles one work-list entry: fetches the URL, stores the body and
     * records the result in the index.
     *
     * @param inLine one <code>key&lt;TAB&gt;url</code> line
     * @throws Exception if fetching or storing fails; nothing is recorded
     *         in the index in that case
     */
    private void download(String inLine) throws Exception {
        int index = inLine.indexOf('\t');
        if (index < MIN_KEY_LENGTH) {
            // No tab at all, or a key too short to derive both directory
            // names from. Reject before any network traffic happens.
            return;
        }
        String md5 = inLine.substring(0, index);
        String url = inLine.substring(index + 1);

        PrudentHttpEntityResolver resolver = new PrudentHttpEntityResolver(
                RESOLVER_SIZE_ARGUMENT, false, null);
        // Only the HTML switch is turned on; every other one is turned off.
        resolver.setAcceptAllKnownXmlTypes(false);
        resolver.setAllowGenericXml(false);
        resolver.setAllowRnc(false);
        resolver.setAllowXhtml(false);
        resolver.setAllowHtml(true);

        InputSource is = resolver.resolveEntity(null, url);
        if (is == null) {
            return;
        }
        String charset = is.getEncoding();
        if (charset == null || charset.indexOf('\t') != -1) {
            // A tab would corrupt the tab-separated index line.
            charset = "null";
        }
        InputStream inStream = is.getByteStream();
        if (inStream == null) {
            return;
        }
        try {
            store(inStream, outputFileFor(md5));
        } finally {
            // Always release the fetched stream, whatever happened above.
            inStream.close();
        }
        out.println(md5 + '\t' + url + '\t' + charset);
    }

    /**
     * Creates the two directory levels for a key (if missing) and returns
     * the file the document is to be written to.
     */
    private File outputFileFor(String md5) {
        File top = new File(rootDir, md5.substring(0, 2));
        File second = new File(top, md5.substring(2, 4));
        synchronized (rootDir) {
            top.mkdir();
            second.mkdir();
        }
        return new File(second, md5 + ".gz");
    }

    /**
     * Copies a stream gzip-compressed into a file. On failure the output
     * stream is closed and the partial file is removed before the exception
     * propagates.
     */
    private static void store(InputStream inStream, File outFile)
            throws IOException {
        OutputStream outStream = new FileOutputStream(outFile);
        boolean complete = false;
        try {
            // Reassigned so the cleanup below closes the outermost stream
            // that was actually constructed.
            outStream = new GZIPOutputStream(outStream);
            IO.copy(inStream, outStream);
            outStream.close();
            complete = true;
        } finally {
            if (!complete) {
                try {
                    outStream.close();
                } catch (IOException ignored) {
                    // The file is discarded anyway.
                }
                outFile.delete();
            }
        }
    }

    /**
     * Command-line entry point. Starts the workers on a shared work list and
     * index, waits for them to finish and then closes both files.
     *
     * @param args <code>args[0]</code>: work list to read (UTF-8);
     *        <code>args[1]</code>: index file to write (UTF-8);
     *        <code>args[2]</code>: root directory for the downloaded files
     * @throws Exception if the files cannot be opened or the wait for the
     *         workers is interrupted
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 3) {
            System.err.println(
                    "Usage: Downloader <url-list> <index-file> <root-dir>");
            System.exit(1);
        }
        PropertyConfigurator.configure(new Properties());
        PrudentHttpEntityResolver.setUserAgent("Mozilla/5.0 (automated dmoz downloader)");
        // NOTE(review): presumably two timeouts in milliseconds and a
        // request limit; confirm against PrudentHttpEntityResolver.
        PrudentHttpEntityResolver.setParams(5000, 5000, 10);
        BufferedReader in = new BufferedReader(new InputStreamReader(
                new FileInputStream(args[0]), "utf-8"));
        try {
            // Auto-flush is on, so each println flushes the index.
            PrintWriter out = new PrintWriter(new OutputStreamWriter(
                    new FileOutputStream(args[1]), "utf-8"), true);
            try {
                File rootDir = new File(args[2]);
                Thread[] workers = new Thread[WORKER_COUNT];
                for (int i = 0; i < workers.length; i++) {
                    workers[i] = new Thread(new Downloader(in, out, rootDir));
                    workers[i].start();
                }
                for (int i = 0; i < workers.length; i++) {
                    workers[i].join();
                }
            } finally {
                out.close();
            }
        } finally {
            in.close();
        }
    }
}