/* Copyright 2013 Fabian Steeg. Licensed under the Eclipse Public License 1.0 */
package org.lobid.lodmill;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.SortedSet;
import java.util.TreeSet;
import org.culturegraph.mf.morph.Metamorph;
import org.culturegraph.mf.morph.MorphErrorHandler;
import org.culturegraph.mf.stream.reader.MarcXmlReader;
import org.culturegraph.mf.stream.reader.Reader;
import org.culturegraph.mf.stream.sink.StringListMap;
import org.junit.Assert;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Ingest the hbz Marc21 export and log errors to debug issues with the format.
 * <p>
 * Every line of the input file is passed to the reader. Errors reported by
 * Metamorph, and exceptions thrown while reading a line, are logged, counted
 * per exception type and written to two report files named after the input:
 * a raw report (each error as it occurs) and a processed report (summary,
 * per-type counts and the sorted, distinct error messages).
 *
 * @author Fabian Steeg (fsteeg)
 */
public final class HbzMarcIngest {

	private static final Logger LOG =
			LoggerFactory.getLogger(HbzMarcIngest.class);

	/** Location of the export to ingest, relative to the working directory. */
	private static final String HBZ_MARC = "../../hbz.cg.20120725.mrc";
	/** Report receiving every error message at the time it occurs. */
	private static final String REPORT_RAW = HBZ_MARC + "-report-raw.txt";
	/** Report receiving the summary, the counts and the distinct messages. */
	private static final String REPORT_PROCESSED =
			HBZ_MARC + "-report-processed.txt";

	private final Reader reader = new MarcXmlReader();
	private final Metamorph metamorph = new Metamorph(Thread.currentThread()
			.getContextClassLoader().getResourceAsStream("ingest.marc21.xml"));
	/** Distinct error messages, kept in natural (sorted) order. */
	private final SortedSet<String> errorSet = new TreeSet<String>();
	/** Number of errors seen so far, keyed by exception simple class name. */
	private final Map<String, Integer> errorMap = new HashMap<String, Integer>();

	/**
	 * Runs the ingest: wires reader, Metamorph and sink, feeds the input file
	 * through the pipeline line by line, and writes both report files.
	 *
	 * @throws IOException if the input or one of the report files cannot be
	 *           opened, read or closed
	 */
	@SuppressWarnings("javadoc")
	@Test
	public void ingest() throws IOException {
		final StringListMap map = new StringListMap();
		reader.setReceiver(metamorph).setReceiver(map);
		final BufferedWriter rawReportWriter =
				new BufferedWriter(new FileWriter(REPORT_RAW));
		try {
			metamorph.setErrorHandler(new MorphErrorHandler() {
				@Override
				public void error(final Exception exception) {
					final String name = exception.getClass().getSimpleName();
					final String errorMessage = String.format("Metamorph error (%s): %s",
							name, exception.getMessage());
					processError(rawReportWriter, name, errorMessage);
				}
			});
			final int all = readRecords(rawReportWriter);
			Assert.assertTrue("Raw report file should exist",
					new File(REPORT_RAW).exists());
			writeProcessedReport(all);
		} finally {
			rawReportWriter.close();
		}
		Assert.assertTrue("Processed report file should exist",
				new File(REPORT_PROCESSED).exists());
	}

	/**
	 * Reads the input file line by line and passes each line to the reader.
	 * The file is opened and closed here, so no other method has to manage it.
	 *
	 * @param rawReportWriter writer of the raw report, handed on to
	 *          {@link #processError(BufferedWriter, String, String)}
	 * @return the number of lines read
	 * @throws IOException if the input file cannot be opened, read or closed
	 */
	private int readRecords(final BufferedWriter rawReportWriter)
			throws IOException {
		// NOTE(review): FileReader decodes with the platform default charset;
		// confirm that this matches the encoding of the export.
		final BufferedReader scanner =
				new BufferedReader(new FileReader(HBZ_MARC));
		try {
			int all = 0;
			String line = null;
			while ((line = scanner.readLine()) != null) { // NOPMD (idiomatic usage)
				all++;
				try {
					reader.read(line);
				} catch (Exception e) {
					// Deliberately broad: a failure on one line is recorded and the
					// loop continues with the next line.
					final String name = e.getClass().getSimpleName();
					final String errorMessage =
							String.format("Metastream error (%s): %s, record: %s", name,
									e.getMessage(), line);
					processError(rawReportWriter, name, errorMessage);
				}
			}
			return all;
		} finally {
			scanner.close();
		}
	}

	/**
	 * Writes the processed report: a summary line, one line per exception type
	 * with its count, then every distinct error message in sorted order. The
	 * summary and the per-type counts are also printed to standard output.
	 *
	 * @param all the number of lines that were read from the input
	 * @throws IOException if the report file cannot be opened, written or closed
	 */
	private void writeProcessedReport(final int all) throws IOException {
		final BufferedWriter processedReportWriter =
				new BufferedWriter(new FileWriter(REPORT_PROCESSED));
		try {
			int err = 0;
			for (Integer count : errorMap.values()) {
				err += count;
			}
			final String summary =
					String.format("Processed %s records, got %s errors:", all, err);
			System.out.println(summary);
			processedReportWriter.write(summary + "\n");
			for (Map.Entry<String, Integer> entry : errorMap.entrySet()) {
				final String countLine =
						String.format("%s: %s", entry.getKey(), entry.getValue());
				System.out.println(countLine);
				processedReportWriter.write(countLine + "\n");
			}
			for (String message : errorSet) {
				processedReportWriter.write(message + "\n");
			}
		} finally {
			processedReportWriter.close();
		}
	}

	/**
	 * Records a single error: logs it, remembers the message, increments the
	 * counter for its type and appends the message to the raw report.
	 *
	 * @param fullReportWriter writer of the raw report
	 * @param name simple class name of the exception, used as the counter key
	 * @param errorMessage the formatted message to log and report
	 */
	private void processError(final BufferedWriter fullReportWriter,
			final String name, final String errorMessage) {
		LOG.error(errorMessage);
		errorSet.add(errorMessage);
		final Integer previous = errorMap.get(name);
		errorMap.put(name, previous == null ? 1 : previous + 1);
		try {
			fullReportWriter.write(errorMessage + "\n");
		} catch (IOException e) {
			// Best effort: a failing report write is logged and not propagated.
			LOG.error(e.getMessage(), e);
		}
	}
}