package org.nines; import java.io.IOException; import java.io.PrintStream; import java.io.PrintWriter; import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.net.URL; import java.text.DecimalFormat; import java.util.ArrayList; import java.util.Arrays; import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Map.Entry; import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; import org.apache.log4j.xml.DOMConfigurator; import com.google.gson.JsonArray; import com.google.gson.JsonElement; import com.google.gson.JsonObject; /** * RDF Compare will perform comparisions on the target arcive and the main SOLR index. * * @author loufoster * */ public class RDFCompare { private RDFIndexerConfig config; private boolean includesText = false; private Logger log; private Logger txtLog; private PrintStream sysOut; private LinkedHashMap<String, List<String>> errors = new LinkedHashMap<String, List<String>>(); private int errorCount = 0; private int txtErrorCount = 0; private SolrClient solrClient; // private static final ArrayList<String> LARGE_TEXT_ARCHIVES = new ArrayList<String>( Arrays.asList( // "PQCh-EAF", "amdeveryday", "amdecj", "oldBailey" )); private static final ArrayList<String> REQUIRED_FIELDS = new ArrayList<String>(Arrays.asList("title_sort", "title", "genre", "archive", "url", "federation", "year_sort", "year_sort_asc", "year_sort_desc", "freeculture", "is_ocr")); private static final ArrayList<String> REQUIRED_PAGES_FIELDS = new ArrayList<String>(Arrays.asList("text", "page_num", "page_of")); /** * Construct an instance of the RDFCompare with the specified config * @param config * @throws IOException */ public RDFCompare(RDFIndexerConfig config) { this.config = config; String logFileRoot = this.config.getLogfileBaseName(""); String compareLog = logFileRoot + "_compare.log"; String skippedLog = logFileRoot + "_skipped.log"; String compareTxtLog = logFileRoot + "_compare_text.log"; System.setProperty("compare.log.file", compareLog); System.setProperty("compare.text.log.file", compareTxtLog); System.setProperty("skipped.log.file", skippedLog); URL url = ClassLoader.getSystemResource("log4j-compare.xml"); DOMConfigurator.configure(url); // init logging this.log = Logger.getLogger("compare"); this.txtLog = Logger.getLogger("compareTxt"); // set up sys out so it can handle utf-8 output try { this.sysOut = new PrintStream(System.out, true, "UTF-8"); } catch (UnsupportedEncodingException e) { this.sysOut = null; } // init the solr connection this.solrClient = new SolrClient(this.config.solrBaseURL); } /** * Perform the comparison based on the config passed into the c'tor */ public void compareArchive() { // log start time Date start = new Date(); this.log.info("Started compare at " + start); logInfo("====== Scanning archive \"" + config.archiveName + "\" ====== "); // get the list of fields and determine if we are to include text String fl = config.getFieldList( ); if( fl.contains( "text" ) == true ) includesText = true; if( fl.equals( "*" ) == true ) includesText = true; // Start at beginning of list and return 500 hits at a time int page = 0; int size = this.config.pageSize; List<JsonObject> archiveDocs = new ArrayList<JsonObject>(); HashMap<String, JsonObject> indexHash = new HashMap<String, JsonObject>(); Set<String> indexUris = new HashSet<String>(); Set<String> archiveUris = new HashSet<String>(); String reindexCore = config.coreName( ); // When fieldlist includes test, and the archive is one that contains // large text fields, limit page size to 1 // if ( this.includesText && LARGE_TEXT_ARCHIVES.contains(config.archiveName)) { // size = 1; // } // counts for text size int totalText = 0; int maxTextSize = 0; int docsWithText = 0; int maxText2 = 0; int maxText5 = 0; int maxText10 = 0; int maxText50 = 0; int maxText100 = 0; int maxText200 = 0; int maxText500 = 0; int maxText1000 = 0; int maxText2000 = 0; int maxText5000 = 0; int maxText10000 = 0; int runningText2 = 0; int runningText5 = 0; int runningText10 = 0; int runningText50 = 0; int runningText100 = 0; int runningText200 = 0; int runningText500 = 0; int runningText1000 = 0; int runningText2000 = 0; int runningText5000 = 0; int runningText10000 = 0; int count = 0; DecimalFormat df = new DecimalFormat(); // read a page of docs back from index and archive. Compare the page hits. // If comparisons were complete, remove the docs from lists. // Repeat til all lists are gone. boolean archiveDone = false; boolean indexDone = false; while ( true ) { List<JsonObject> pageHits = null; // get hits from archive, tally totals and check for end if ( archiveDone == false ) { pageHits = this.solrClient.getResultsPage( reindexCore, config.archiveName, page, size, fl, null, null ); if (pageHits.size() < size) { archiveDone = true; } // save off the set of uris for the archived docs for (JsonObject doc : pageHits) { int thisSize = 0; if (doc.has("text")) { docsWithText++; thisSize = doc.get("text").getAsString().length(); totalText += thisSize; if (thisSize > maxTextSize) maxTextSize = thisSize; } runningText2 += thisSize; runningText5 += thisSize; runningText10 += thisSize; runningText50 += thisSize; runningText100 += thisSize; runningText200 += thisSize; runningText500 += thisSize; runningText1000 += thisSize; runningText2000 += thisSize; runningText5000 += thisSize; runningText10000 += thisSize; count++; if (count % 2 == 0) { if (runningText2 > maxText2) maxText2 = runningText2; runningText2 = 0; } if (count % 5 == 0) { if (runningText5 > maxText5) maxText5 = runningText5; runningText5 = 0; } if (count % 10 == 0) { if (runningText10 > maxText10) maxText10 = runningText10; runningText10 = 0; } if (count % 50 == 0) { if (runningText50 > maxText50) maxText50 = runningText50; runningText50 = 0; } if (count % 100 == 0) { if (runningText100 > maxText100) maxText100 = runningText100; runningText100 = 0; } if (count % 200 == 0) { if (runningText200 > maxText200) maxText200 = runningText200; runningText200 = 0; } if (count % 500 == 0) { if (runningText500 > maxText500) maxText500 = runningText500; runningText500 = 0; } if (count % 1000 == 0) { if (runningText1000 > maxText1000) maxText1000 = runningText1000; runningText1000 = 0; } if (count % 2000 == 0) { if (runningText2000 > maxText2000) maxText2000 = runningText2000; runningText2000 = 0; } if (count % 5000 == 0) { if (runningText5000 > maxText5000) maxText5000 = runningText5000; runningText5000 = 0; } if (count % 10000 == 0) { if (runningText10000 > maxText10000) maxText10000 = runningText10000; runningText10000 = 0; } archiveDocs.add(doc); archiveUris.add(doc.get("uri").getAsString()); } } // get index docs if ( indexDone == false) { String core = "resources"; if ( this.config.isPagesArchive() ) { core = "pages"; } pageHits = this.solrClient.getResultsPage( core, config.archiveName, page, size, fl, null, null ); if (pageHits.size() < size) { indexDone = true; } // hash the indexed docs by uri to speed stuff up for (JsonObject doc : pageHits) { String uri = doc.get("uri").getAsString(); indexHash.put(uri, doc); indexUris.add(uri); } } // compare. This will also remove processed docs from each compareLists(indexHash, archiveDocs); // next page? if ( archiveDone == true && indexDone == true ) { break; } else { page++; } } if (runningText2 > maxText2) maxText2 = runningText2; if (runningText5 > maxText5) maxText5 = runningText5; if (runningText10 > maxText10) maxText10 = runningText10; if (runningText50 > maxText50) maxText50 = runningText50; if (runningText100 > maxText100) maxText100 = runningText100; if (runningText200 > maxText200) maxText200 = runningText200; if (runningText500 > maxText500) maxText500 = runningText500; if (runningText1000 > maxText1000) maxText1000 = runningText1000; if (runningText2000 > maxText2000) maxText2000 = runningText2000; if (runningText5000 > maxText5000) maxText5000 = runningText5000; if (runningText10000 > maxText10000) maxText10000 = runningText10000; // if there's stuff left in the archiveDocs, and we are looking at text, dump it if (archiveDocs.size() > 0 && this.includesText) { this.txtLog.info(" ============================= TEXT ADDED TO ARCHIVE ==========================="); for (JsonObject doc : archiveDocs) { this.txtLog .info("---------------------------------------------------------------------------------------------------------------"); this.txtLog.info(" --- " + doc.get("uri").getAsString() + " ---"); if (doc.has("text")) { this.txtLog.info(doc.get("text").getAsString()); this.txtErrorCount++; } } this.txtLog .info("---------------------------------------------------------------------------------------------------------------"); } // done log some stats this.log.info("Total Docs Scanned: " + archiveUris.size() + ". Total Errors: " + this.errorCount + "."); this.log.info(" retrieved " + archiveUris.size() + " new objects;"); this.log.info(" retrieved " + indexUris.size() + " old objects;"); if (this.includesText) { this.txtLog.info("Total Docs Scanned: " + archiveUris.size() + ". Total Errors: " + this.txtErrorCount + "."); } this.txtLog.info("Largest Text Size: " + df.format(maxTextSize) + "."); this.txtLog.info("Number of Docs with Text: " + df.format(docsWithText) + "."); this.txtLog.info("Total Text Size: " + df.format(totalText) + "."); this.txtLog.info("Running Text Sizes:\n2=" + df.format(maxText2) + "\n5=" + df.format(maxText5) + "\n10=" + df.format(maxText10) + "\n50=" + df.format(maxText50) + "\n100=" + df.format(maxText100) + "\n200=" + df.format(maxText200) + "\n500=" + df.format(maxText500) + "\n1000=" + df.format(maxText1000) + "\n2000=" + df.format(maxText2000) + "\n5000=" + df.format(maxText5000) + "\n10000=" + df.format(maxText10000)); Date end = new Date(); double durationSec = (end.getTime() - start.getTime()) / 1000.0; if (durationSec >= 60) { logInfo(String.format("JAVA Finished in %3.2f minutes.", (durationSec / 60.0))); } else { logInfo(String.format("JAVA Finished in %3.2f seconds.", durationSec)); } // now check for skipped stuff doSkippedTest(indexUris, archiveUris); } private void logErrors() { for (Map.Entry<String, List<String>> entry : this.errors.entrySet()) { String uri = entry.getKey(); if (uri.equals("txt")) { for (String msg : entry.getValue()) { logInfo(msg); } } else { logInfo("---" + uri + "---"); for (String msg : entry.getValue()) { logInfo(" " + msg); } } } this.errors.clear(); } /** * Compare the set of URIs in the index ad archive. List out all new documents and * all old. Show a skipped count (skipped is a doc in the original index, but not * the archive * @param indexUris Set uf URIs from the main index * @param archiveUris List of SolrDocuments in the index */ private void doSkippedTest(Set<String> indexUris, Set<String> archiveUris) { // set up logger just for skipped files Logger skippedLog = Logger.getLogger("skipped"); Date started = new Date(); skippedLog.info("Started: " + started); skippedLog.info("====== Scanning archive \"" + config.archiveName + "\" ====== "); skippedLog.info("retrieved " + archiveUris.size() + " new objects;"); skippedLog.info("retrieved " + indexUris.size() + " old objects;"); Set<String> oldOnly = new HashSet<String>(indexUris); oldOnly.removeAll(archiveUris); archiveUris.removeAll(indexUris); for (String uri : oldOnly) { skippedLog.info(" Old: " + uri); } for (String uri : archiveUris) { skippedLog.info(" New: " + uri); } skippedLog.info("Total not indexed: " + oldOnly.size() + ". Total new: " + archiveUris.size() + "."); } /** * Scan thru each document in the archive and find differences * @param indexHash List of all original docs in the index * @param archiveDocs List of docs in the reindexed archive * @throws Exception */ private void compareLists(HashMap<String, JsonObject> indexHash, List<JsonObject> archiveDocs) { // Run thru al items in new archive. Validate correct data // and compare against object in original index if possible Iterator<JsonObject> itr = archiveDocs.iterator(); while (itr.hasNext()) { // look up the corresponding object in the original index JsonObject doc = itr.next(); String uri = doc.get("uri").getAsString(); JsonObject indexDoc = indexHash.get(uri); // If we have matches do the work if (indexDoc != null) { // On full compares, validaate all required // fields are present and contain content if (this.config.ignoreFields.length() == 0 && this.config.includeFields.equals("*")) { validateRequiredFields(doc); } // comapre all fields try { compareFields(uri, indexDoc, doc); } catch (Exception e) { addError(uri, "Threw exception during compareFields: "+e.toString() ); StringWriter sw = new StringWriter(); e.printStackTrace( new PrintWriter(sw) ); addError(uri, "Stack Trace:\n\n"+sw.toString()); } // dump results logErrors(); // done with them indexHash.remove(uri); itr.remove(); } } } /** * Walk through each field in the new doc and compare it with the * old. Log any differences. * @param uri * @param indexDoc * @param doc */ private void compareFields(String uri, JsonObject indexDoc, JsonObject doc) { // loop over all keys in doc for (Entry<String, JsonElement> entry : doc.entrySet()) { // get key and do special handing for text fields String key = entry.getKey(); if (key.equals("text")) { compareText(uri, indexDoc, doc); continue; } // grab new val String newVal = toSolrString(entry.getValue()); // is this a new key? if (indexDoc.has(key) == false) { if (isIgnoredNewField(key) == false) { addError(uri, key + " " + newVal.replaceAll("\n", " / ") + " introduced in reindexing."); } continue; } // get parallel val in indexDoc String oldVal = toSolrString(indexDoc.get(key)); // dump the key from indexDoc so we can detect // unindexed values later indexDoc.remove(key); // don't compare score or indexing dates. if (key.equals("score") || key.equals("date_updated") || key.equals("date_created") || key.equals("_version_") ) { continue; } // difference? if (newVal.equals(oldVal) == false) { // make sure everything is escaped and check again. String escapedOrig = getProcessedOrigField(oldVal); String escapedNew = getProcessedReindexedField(newVal); if (escapedNew.equals(escapedOrig) == false) { // too long to dump in a single error line? if (oldVal.length() > 30) { // log a summary addError(uri, key + " mismatched: length= " + newVal.length() + " (new)" + " vs. " + oldVal.length() + " (old)", true); // then find first diff and log it String[] oldArray = oldVal.split("\n"); String[] newArray = newVal.split("\n"); for (int i = 0; i <= oldArray.length; i++) { if (oldArray[i].equals(newArray[i]) == false) { addError(uri, " at line " + i + ":\n" + "\"" + newArray[i].replaceAll("\n", " / ") + "\" vs.\n" + "\"" + oldArray[i].replaceAll("\n", " / ") + "\"", true); break; } } } else { // dump the entire diff to the log addError(uri, key + " mismatched: \"" + newVal.replaceAll("\n", " / ") + "\" (new)" + " vs. \"" + oldVal.replaceAll("\n", " / ") + "\" (old)"); } } } } // now see if there are any leftover fields in indexDoc // log them is not reindexed for (Entry<String, JsonElement> entry : indexDoc.entrySet()) { String val = toSolrString(entry.getValue()); String key = entry.getKey(); if (isIgnoredOldField(key) == false) { if (val.length() > 100) { val = val.substring(0, 100); } addError(uri, "Key not reindexed: " + key + "=" + val, true); } } } /** * Convert an Entry contaning solr data to a string * @param obj * @return The string data represented by the object */ private final String toSolrString(final JsonElement obj) { if (obj.isJsonArray()) { JsonArray jsonArray = (JsonArray) obj; Iterator<JsonElement> itr = jsonArray.iterator(); StringBuilder out = new StringBuilder(); while (itr.hasNext()) { if (out.length() > 0) { out.append(" | "); } out.append(itr.next().getAsString()); } return out.toString(); } return obj.getAsString(); } /** * Compare just the TEXT field of the index and archive docs * @param uri * @param indexDoc * @param doc */ private void compareText(String uri, JsonObject indexDoc, JsonObject doc) { String newTxt = null; if (doc.has("text")) { newTxt = doc.get("text").getAsString(); } String oldTxt = null; if (indexDoc.has("text")) { oldTxt = indexDoc.get("text").getAsString(); indexDoc.remove("text"); } // log additional errors if no new text and doc is flagged // such that it must have text (ocr or full text) boolean compareTexts = true; if ( this.config.isPagesArchive() ) { if (newTxt == null ) { this.txtLog.error(uri + ": is page data, but is missing page text in the new index."); this.txtErrorCount++; compareTexts = false; } if ( oldTxt == null ) { this.txtLog.error(uri + ": is page data, but is missing page text in the pages core."); this.txtErrorCount++; compareTexts = false; } } else { if (newTxt == null) { String val = doc.get("has_full_text").toString(); if (val.equalsIgnoreCase("false")) { this.txtLog.error(uri + ": field has_full_text is " + val + " but full text does not exist."); this.txtErrorCount++; compareTexts = false; } val = doc.get("is_ocr").toString(); if (val.equalsIgnoreCase("false")) { this.txtLog.error(uri + ": field is_ocr is " + val + " but full text does not exist."); this.txtErrorCount++; compareTexts = false; } } if (newTxt == null && oldTxt != null) { this.txtLog.error(uri + ":text field has disappeared from the new index. (old text size = " + oldTxt.length()); this.txtErrorCount++; compareTexts = false; } else if (newTxt != null && oldTxt == null) { this.txtLog.error(uri + ":text field has appeared in the new index."); this.txtErrorCount++; compareTexts = false; } } if ( compareTexts ) { if (newTxt.equals(oldTxt) == false) { newTxt = getProcessedReindexedText(newTxt); oldTxt = getProcessedOrigText(oldTxt); if (oldTxt.equals(newTxt) == false) { logMismatchedText(uri, oldTxt, newTxt); } } } } private void logMismatchedText(final String uri, final String oldTxt, final String newTxt) { int pos = StringUtils.indexOfDifference(newTxt, oldTxt); pos = Math.max(0, pos - 4); String newSub = newTxt.substring(pos, Math.min(pos + 51, newTxt.length())); String oldSub = oldTxt.substring(pos, Math.min(pos + 51, oldTxt.length())); this.txtLog.error("==== " + uri + " mismatch at line 0 col " + pos + ":"); this.txtLog.error("(new " + newTxt.length() + ")"); this.txtLog.error(newSub); this.txtLog.error("-- vs --"); this.txtLog.error("(old " + oldTxt.length() + ")"); this.txtLog.error(oldSub); this.txtLog.error("NEW: " + getBytesString(newSub)); this.txtLog.error("OLD: " + getBytesString(oldSub)); this.txtErrorCount++; } private String getBytesString(String text) { try { byte[] bytes = text.getBytes("UTF-8"); StringBuffer hexStr = new StringBuffer(); for (int i = 0; i < bytes.length; i++) { hexStr.append(Integer.toString(0xFF & bytes[i])).append(" "); if (hexStr.length() > 45) break; } return hexStr.toString(); } catch (Exception e) { addError("txt", "Invalid bytes in text: " + e.getMessage()); return "** ERROR **"; } } private String getProcessedOrigField(String origVal) { return removeExtraWhiteSpace(origVal); } private String getProcessedReindexedField(String origVal) { return removeExtraWhiteSpace(origVal); } private String getProcessedOrigText(String origTxt) { String val = origTxt.replaceAll("““", "“"); val = val.replaceAll("””", "””"); val = val.replaceAll("††", "†"); val = val.replaceAll("\\—+", "—"); return removeExtraWhiteSpace(val); } private String getProcessedReindexedText(String srcTxt) { String val = srcTxt.replaceAll("““", "“"); val = val.replaceAll("””", "””"); val = val.replaceAll("††", "†"); val = val.replaceAll("\\—+", "—"); return removeExtraWhiteSpace(val); } private String removeExtraWhiteSpace(final String srcTxt) { String result = srcTxt.replaceAll("\t", " "); // change tabs to spaces result = result.replaceAll("\\s+", " "); // get rid of multiple spaces result = result.replaceAll(" \n", "\n"); // get rid of trailing spaces result = result.replaceAll("\n ", "\n"); // get rid of leading spaces result = result.replaceAll("\\n+", " "); // get rid of lines return result; } /** * EXCEPTION case. Dont whine about fields we know are newly added * @param key * @return */ private boolean isIgnoredNewField(String key) { if (key.equals("date_created") || key.equals("date_updated")) { return true; } return false; } private boolean isIgnoredOldField(String key) { if (key.equals("batch")) { return true; } return false; } private void addError(String uri, String err) { addError(uri, err, false); } private void addError(String uri, String err, boolean tail) { if (this.errors.containsKey(uri) == false) { this.errors.put(uri, new ArrayList<String>()); } if (uri.equals("txt") || tail) { this.errors.get(uri).add(err); } else { this.errors.get(uri).add(0, err); } if (uri.equals("txt") == false) { this.errorCount++; } } /** * Ensure that all required fields are present and contain data * @param doc Document XML data * @throws Exception */ private void validateRequiredFields(JsonObject doc) { ArrayList<String> reqFields = REQUIRED_FIELDS; if ( this.config.isPagesArchive()) { reqFields = REQUIRED_PAGES_FIELDS; } for (String fieldName : reqFields ) { // find the first element in the correct doc that // has a name attribute matching the required field String uri = doc.get("uri").getAsString(); Object docField = doc.get(fieldName); // make sure field is present if (docField == null) { addError(uri, "required field: " + fieldName + " missing in new index"); } else { // if its an array, make sure it has children // and that the concatenated children content has length if (docField instanceof List) { @SuppressWarnings("unchecked") List<String> list = (List<String>) docField; String val = ""; for (String data : list) { val += data; } if (val.length() == 0) { addError(uri, "required ARR field: " + fieldName + " is all spaces in new index"); } } else { if (docField.toString().trim().length() == 0) { addError(uri, "required STR field: " + fieldName + " is all spaces in new index"); } } } } } /** * Log data to file and System.out * @param msg */ private void logInfo(final String msg) { log.info(msg); if (this.sysOut != null) { this.sysOut.println(msg); } else { System.out.println(msg); } } }