package gate.mimir.test;

import gate.Gate;
import gate.mimir.DocumentRenderer;
import gate.mimir.IndexConfig;
import gate.mimir.MimirIndex;
import gate.mimir.index.DocumentData;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.OutputStreamWriter;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;

import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.commons.lang.StringUtils;

/**
 * Command-line tool that reads the <code>mimir-collection-*.zip</code> files
 * found in an index directory and, for each one, writes two zip files into an
 * output directory:
 * <ul>
 * <li><code>rendered-&lt;name&gt;</code>, holding one UTF-8 entry per document
 * with the output of the index's {@link DocumentRenderer};</li>
 * <li><code>meta-&lt;name&gt;</code>, holding one <code>&lt;id&gt;.meta</code>
 * entry per document with a small JSON object giving its URI and title.</li>
 * </ul>
 * Document IDs can be renumbered via the system properties
 * <code>federatedIndex.size</code> and <code>federatedIndex.offset</code>, and
 * documents whose (renumbered) ID is below <code>minDocId</code> are skipped.
 */
public class RenderZipCollection {

  /**
   * Entry point.
   *
   * @param args <code>args[0]</code> is the index directory,
   *          <code>args[1]</code> is the output directory.
   * @throws IllegalArgumentException if fewer than two arguments are given.
   * @throws IOException if the index directory cannot be listed, the output
   *           directory cannot be created, or reading/writing a zip fails.
   * @throws Exception for any failure in GATE initialisation, index
   *           configuration loading or document rendering.
   */
  public static void main(String[] args) throws Exception {
    if(args == null || args.length < 2) {
      throw new IllegalArgumentException(
          "Usage: RenderZipCollection <indexDir> <outputDir>");
    }

    Gate.setGateHome(new File("gate-home"));
    Gate.setUserConfigFile(new File("gate-home/user-gate.xml"));
    Gate.init();
    // load the plugins (tokeniser, DB, measurements, SPARQL), in this order
    String[] pluginDirs = {
        "gate-home/plugins/ANNIE-tokeniser",
        "../plugins/db-h2",
        "../plugins/measurements",
        "../plugins/sparql"
    };
    for(String pluginDir : pluginDirs) {
      Gate.getCreoleRegister().registerDirectories(
          new File(pluginDir).toURI().toURL());
    }

    File indexDir = new File(args[0]);
    File outputDir = new File(args[1]);
    // make sure we can actually create the output files later on
    if(!outputDir.isDirectory() && !outputDir.mkdirs()) {
      throw new IOException("Could not create output directory " + outputDir);
    }

    // renumbering rules if required
    int multiplier = Integer.getInteger("federatedIndex.size", 1);
    int offset = Integer.getInteger("federatedIndex.offset", 0);
    long minId = Long.getLong("minDocId", 0);

    // load the IndexConfig to obtain the right renderer
    IndexConfig indexConfig = IndexConfig.readConfigFromFile(
        new File(indexDir, MimirIndex.INDEX_CONFIG_FILENAME), indexDir);
    DocumentRenderer renderer = indexConfig.getDocumentRenderer();

    // enumerate the zip collection files
    File[] zipCollectionFiles = indexDir.listFiles(new FilenameFilter() {
      @Override
      public boolean accept(File dir, String name) {
        return name.startsWith("mimir-collection-") && name.endsWith(".zip");
      }
    });
    // listFiles returns null (not an empty array) if indexDir is not a
    // directory or an I/O error occurs
    if(zipCollectionFiles == null) {
      throw new IOException("Could not list the contents of index directory "
          + indexDir);
    }

    for(File zf : zipCollectionFiles) {
      // for each input file, create a corresponding pair of output files
      File outFile = new File(outputDir, "rendered-" + zf.getName());
      File metaFile = new File(outputDir, "meta-" + zf.getName());
      try(ZipInputStream collIn =
              new ZipInputStream(new FileInputStream(zf));
          ZipOutputStream rendOut =
              new ZipOutputStream(new FileOutputStream(outFile));
          ZipOutputStream metaOut =
              new ZipOutputStream(new FileOutputStream(metaFile))) {
        ZipEntry inEntry;
        while((inEntry = collIn.getNextEntry()) != null) {
          long docId = renumber(inEntry.getName(), multiplier, offset);
          if(docId < minId) {
            continue;
          }
          // for each document, load the DocumentData from the original zip.
          // The CloseShieldInputStream stops the ObjectInputStream from
          // closing the whole zip when it is closed.
          // NOTE(review): this is native Java deserialization - only run this
          // tool against index directories you trust.
          DocumentData dd = null;
          try(ObjectInputStream ois = new ObjectInputStream(
              new CloseShieldInputStream(collIn))) {
            dd = (DocumentData)ois.readObject();
          }
          if(dd == null) {
            System.out.println("Error converting document "
                + inEntry.getName());
            continue;
          }
          // write the rendered form to the new zip (in UTF-8)
          try(BufferedWriter w =
              openEntryWriter(rendOut, String.valueOf(docId))) {
            renderer.render(dd, null, w);
          }
          // write the metadata entry as JSON
          try(BufferedWriter w = openEntryWriter(metaOut, docId + ".meta")) {
            writeMetadata(w, dd);
          }
        }
      }
    }
  }

  /**
   * Starts a new entry in the given zip and returns a UTF-8 writer for its
   * content. Closing the writer flushes it and closes the <em>entry</em>
   * only; the zip stream itself stays open for further entries.
   *
   * @param zipOut the zip stream to add the entry to.
   * @param entryName the name of the new entry.
   * @return a writer whose output becomes the content of the new entry.
   * @throws IOException if the entry cannot be started.
   */
  private static BufferedWriter openEntryWriter(ZipOutputStream zipOut,
      String entryName) throws IOException {
    zipOut.putNextEntry(new ZipEntry(entryName));
    FilterOutputStream entryStream = new FilterOutputStream(zipOut) {
      @Override
      public void write(byte[] b, int off, int len) throws IOException {
        // FilterOutputStream's default implementation writes one byte at a
        // time; hand the whole chunk to the zip stream instead
        out.write(b, off, len);
      }

      @Override
      public void close() throws IOException {
        flush();
        ((ZipOutputStream)out).closeEntry();
      }
    };
    return new BufferedWriter(new OutputStreamWriter(entryStream, "UTF-8"));
  }

  /**
   * Writes the URI and title of a document as a single JSON object of the
   * form <code>{"uri":"...","title":"..."}</code>.
   *
   * @param w the writer to write to.
   * @param dd the document whose metadata should be written.
   * @throws IOException if writing fails.
   */
  private static void writeMetadata(BufferedWriter w, DocumentData dd)
      throws IOException {
    // escapeJava rather than escapeJavaScript: the latter emits \' for single
    // quotes, which is not a legal escape sequence in JSON
    w.write("{\"uri\":\"");
    StringEscapeUtils.escapeJava(w, dd.getDocumentURI());
    w.write("\",\"title\":\"");
    StringEscapeUtils.escapeJava(w, dd.getDocumentTitle());
    w.write("\"}");
  }

  /**
   * Renumber the original name to match the ID it would have in a federated
   * index if this were the (0-based) <code>offset</code>th index in a
   * federation of size <code>multiplier</code>.
   *
   * @param originalName the original document ID, as a decimal string.
   * @param multiplier the number of indexes in the federation.
   * @param offset the 0-based position of this index within the federation.
   * @return <code>(originalName * multiplier) + offset</code>, computed in
   *         <code>long</code> arithmetic (no overflow check is performed).
   * @throws NumberFormatException if <code>originalName</code> is not a valid
   *           <code>long</code>.
   */
  public static long renumber(String originalName, int multiplier, int offset)
      throws NumberFormatException {
    long originalId = Long.parseLong(originalName);
    return (originalId * multiplier) + offset;
  }
}