package org.docx4j.samples; import java.io.File; import java.io.IOException; import java.io.PrintStream; import org.apache.commons.io.FileUtils; import org.docx4j.Docx4J; import org.docx4j.anon.Anonymize; import org.docx4j.anon.AnonymizeResult; import org.docx4j.openpackaging.exceptions.Docx4JException; import org.docx4j.openpackaging.packages.WordprocessingMLPackage; import org.docx4j.openpackaging.parts.Part; public class AnonCorpus { private final static String DIR_IN = System.getProperty("user.dir") + "/corpus/"; private final static String DIR_OUT = System.getProperty("user.dir") + "/OUT6/"; private final static String DIR_HANDLED = System.getProperty("user.dir") + "/corpus-handled/"; private static final String DIR_OK = "ok"; private static final String DIR_LEAKS = "leaks"; private static final String DIR_ERRORS = "errors"; private static final String DIR_GLYPH = "glyph-issues"; private int oks = 0; private int leaks = 0; private int errors = 0; private StringBuffer sbLeaks = new StringBuffer(); public static void main(String[] args) throws Exception { AnonCorpus corpusAnon = new AnonCorpus(); corpusAnon.createDirs(); corpusAnon.walk(DIR_IN); System.out.println(corpusAnon.sbLeaks.toString()); System.out.println("leaks: " + corpusAnon.leaks); System.out.println("errors: " + corpusAnon.errors); System.out.println("oks: " + corpusAnon.oks); } private void createDirs() throws IOException { // create OK, leak dirs FileUtils.forceMkdir(new File(DIR_OUT+DIR_OK)); FileUtils.forceMkdir(new File(DIR_OUT+DIR_LEAKS)); FileUtils.forceMkdir(new File(DIR_OUT+DIR_ERRORS)); FileUtils.forceMkdir(new File(DIR_OUT+DIR_GLYPH)); } int docNum = 1; public void walk( String path ) throws IOException { File root = new File( path ); File[] list = root.listFiles(); if (list == null) return; for ( File f : list ) { if ( f.isDirectory() ) { walk( f.getAbsolutePath() ); //System.out.println( "Dir:" + f.getAbsoluteFile() ); } else { // System.out.println( "File:" + f.getAbsoluteFile() ); if (f.getName().endsWith("docx") || f.getName().endsWith("docm")) { try { handle(f) ; FileUtils.moveFile(f, new File(DIR_HANDLED + f.getName())); } catch (Exception e) { if (e.getMessage()!=null && e.getMessage().startsWith("Ran out of patience")) { FileUtils.copyFile(f, new File(DIR_OUT+DIR_GLYPH+"/" + f.getName()+".docx")); } else if (e.getMessage()!=null && e.getMessage().startsWith("This file seems to be a binary doc")) { FileUtils.copyFile(f, new File(DIR_OUT+DIR_ERRORS+"/" + f.getName()+".doc")); // rename the original FileUtils.moveFile(f, new File(f.getAbsolutePath()+".doc")); } else { errors++; e.printStackTrace(); FileUtils.copyFile(f, new File(DIR_OUT+DIR_ERRORS+"/" + f.getName())); File file = new File(DIR_OUT+DIR_ERRORS+"/" + f.getName() + "err.txt"); PrintStream ps = new PrintStream(file); e.printStackTrace(ps); ps.close(); } } docNum++; } } } } private void handle(File fIn) throws Docx4JException { System.out.println("\n\n " + docNum + " Processing " + fIn.getName() + "\n\n"); WordprocessingMLPackage pkg = null; try { pkg = Docx4J.load(fIn); } catch (ClassCastException e) { // eg dodgy docx: CustomXmlDataStoragePart cannot be cast to org.docx4j.openpackaging.parts.CustomXmlDataStoragePropertiesPart throw new Docx4JException(e.getMessage(), e); } Anonymize anon = new Anonymize(pkg); AnonymizeResult result = anon.go(); String lang = "default"; if (result.hasHiragana || result.hasKatakana) { lang="Japanese"; } else if (result.hasArabic) { lang="Arabic"; } else if (result.hasHebrew) { lang="Hebrew"; } else if (result.hasCyrillic) { lang="Cyrillic"; } else if (result.hasGreek) { lang="Greek"; } else if (result.hasCJK) { lang = "CJK"; } if (result.isOK()) { oks++; System.out.println("document successfully anonymised."); File dir = new File(DIR_OUT+DIR_OK+"/"+lang); dir.mkdirs(); Docx4J.save(pkg, new java.io.File(DIR_OUT+DIR_OK+"/"+lang + "/"+ fIn.getName())); } else { leaks++; // Report reportLeak("\n\n REPORT for " + fIn.getName() + "\n\n"); File dir = new File(DIR_OUT+DIR_LEAKS+"/"+lang); dir.mkdirs(); String outputfilepath = DIR_OUT+DIR_LEAKS+"/"+lang + "/"+ fIn.getName(); Docx4J.save(pkg, new java.io.File(outputfilepath)); reportLeak("document partially anonymised; please check " + outputfilepath); if (result.getUnsafeParts().size()>0) { reportLeak("The following parts may leak info:"); for(Part p : result.getUnsafeParts()) { reportLeak(p.getPartName().getName() + ", of type " + p.getClass().getName() ); } } // unsafe objects reportLeak(result.reportUnsafeObjects()); System.out.println("\n\n .. end REPORT for " + fIn.getName() + "\n\n"); } if (result.getFieldsPresent().size()>0) { for (String s : result.getFieldsPresent()) { System.out.println(s); } } } private void reportLeak(String message) { System.out.println(message); sbLeaks.append(message + "\n"); } }