package org.solrmarc.marc; import java.io.*; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Properties; import org.apache.log4j.Logger; import org.marc4j.*; import org.solrmarc.marc.MarcUnprettyXmlReader; import org.solrmarc.tools.PropertyUtils; public class SolrMarcMarcReaderFactory { protected boolean verbose = false; /** The full class name of SolrIndexer or the subclass to be used */ //protected Properties configProps; protected boolean inputTypeXML = false; protected boolean inputTypeBinary = false; protected boolean inputTypeJSON = false; protected boolean includeErrors = false; protected boolean permissiveReader; protected String defaultEncoding; protected boolean to_utf_8; protected String combineConsecutiveRecordsFields = null; protected String unicodeNormalize = null; // Initialize logging category static Logger logger = Logger.getLogger(SolrMarcMarcReaderFactory.class.getName()); private SolrMarcMarcReaderFactory() { } static SolrMarcMarcReaderFactory theFactory = new SolrMarcMarcReaderFactory(); public static SolrMarcMarcReaderFactory instance() { return(theFactory); } public MarcReader makeReader(Properties config, String[] searchDirectories, String ... inputFilenames) { if (inputFilenames.length == 0) { return makeReader(config, searchDirectories, "stdin"); } else if (inputFilenames.length == 1) { return makeReader(config, searchDirectories, inputFilenames[0]); } List<MarcReader> readers = new ArrayList<>(); for (String inputFilename : inputFilenames) { MarcReader reader = makeReader(config, searchDirectories, inputFilename); readers.add(reader); } return(new MarcMultiplexReader(readers, Arrays.asList(inputFilenames))); } public MarcReader makeReader(Properties config, String[] searchDirectories, List<String> inputFilenames) { if (inputFilenames.size() == 0) { return makeReader(config, searchDirectories, "stdin"); } else if (inputFilenames.size() == 1) { return makeReader(config, searchDirectories, inputFilenames.iterator().next()); } List<MarcReader> readers = new ArrayList<>(); for (String inputFilename : inputFilenames) { MarcReader reader = makeReader(config, searchDirectories, inputFilename); readers.add(reader); } return(new MarcMultiplexReader(readers, inputFilenames)); } public MarcReader makeReader(Properties config, String[] searchDirectories, String inputFilename) { InputStream is; if (inputFilename.equals("-") || inputFilename.equals("stdin")) { is = new BufferedInputStream(System.in); } else { try { is = new BufferedInputStream(new FileInputStream(inputFilename)); } catch (FileNotFoundException e) { logger.error("Fatal error: Exception opening InputStream: " + inputFilename); throw new IllegalArgumentException("Fatal error: Exception opening InputStream" + inputFilename); } } return(makeReader(config, searchDirectories, is)); } public MarcReader makeReader(Properties config, String[] searchDirectories, InputStream input) { MarcReader reader; setMarc4JProperties(config); combineConsecutiveRecordsFields = PropertyUtils.getProperty(config, "marc.combine_records"); if (combineConsecutiveRecordsFields != null && combineConsecutiveRecordsFields.length() == 0) combineConsecutiveRecordsFields = null; permissiveReader = Boolean.parseBoolean(PropertyUtils.getProperty(config, "marc.permissive")); if (PropertyUtils.getProperty(config, "marc.default_encoding") != null) { defaultEncoding = PropertyUtils.getProperty(config, "marc.default_encoding").trim(); } else { defaultEncoding = "BESTGUESS"; } // verbose = Boolean.parseBoolean(PropertyUtils.getProperty(configProps, "marc.verbose")); includeErrors = Boolean.parseBoolean(PropertyUtils.getProperty(config, "marc.include_errors")); to_utf_8 = Boolean.parseBoolean(PropertyUtils.getProperty(config, "marc.to_utf_8")); unicodeNormalize = PropertyUtils.getProperty(config, "marc.unicode_normalize"); if (unicodeNormalize != null) { unicodeNormalize = handleUnicodeNormalizeParm(unicodeNormalize); } InputStream is; if (input.markSupported()) { is = input; } else { is = new BufferedInputStream(input); } is.mark(20); byte[] buffer = new byte[15]; @SuppressWarnings("unused") int numRead; try { numRead = is.read(buffer); is.reset(); } catch (IOException e) { logger.error("Fatal error: Exception reading from InputStream"); throw new IllegalArgumentException("Fatal error: Exception reading from InputStream"); } String filestart = new String(buffer); inputTypeXML = false; inputTypeBinary = false; inputTypeJSON = false; if (numRead == -1 || filestart.length() == 0) inputTypeBinary = true; else if (filestart.substring(0, 5).equalsIgnoreCase("<?xml")) inputTypeXML = true; else if (filestart.startsWith("{")) inputTypeJSON = true; else if (filestart.substring(0, 5).matches("\\d\\d\\d\\d\\d")) inputTypeBinary = true; else if (filestart.contains("<?xml") || filestart.contains("<?XML")) inputTypeXML = true; else if (filestart.contains("<collection")) inputTypeXML = true; else if (filestart.contains("<record")) inputTypeXML = true; else if (filestart.contains("<!--")) inputTypeXML = true; if (inputTypeXML) { to_utf_8 = true; reader = new MarcUnprettyXmlReader(is); } else if (inputTypeJSON) { to_utf_8 = true; reader = new MarcJsonReader(is); } else if (inputTypeBinary && permissiveReader) { reader = new MarcPermissiveStreamReader(is, true, to_utf_8, defaultEncoding); } else if (inputTypeBinary) { reader = new MarcPermissiveStreamReader(is, false, to_utf_8, defaultEncoding); } else { logger.error("Fatal error: Unable to determine type of inputfile"); throw new IllegalArgumentException("Fatal error: Unable to determine type of inputfile. File starts with: "+ filestart); } // Add Combine Record reader if requested if (reader != null && combineConsecutiveRecordsFields != null) { String combineLeftField = PropertyUtils.getProperty(config, "marc.combine_records.left_field"); String combineRightField = PropertyUtils.getProperty(config, "marc.combine_records.right_field"); reader = new MarcCombiningReader(reader, combineConsecutiveRecordsFields, combineLeftField, combineRightField); } // Add FilteredReader if requested String marcIncludeIfPresent = PropertyUtils.getProperty(config, "marc.include_if_present"); String marcIncludeIfMissing = PropertyUtils.getProperty(config, "marc.include_if_missing"); String marcDeleteSubfields = PropertyUtils.getProperty(config, "marc.delete_subfields"); if (marcDeleteSubfields != null && marcDeleteSubfields.equals("nomap")) marcDeleteSubfields = null; String marcRemapRecord = PropertyUtils.getProperty(config, "marc.reader.remap"); if (marcRemapRecord != null && marcRemapRecord.equals("nomap")) marcRemapRecord = null; if (marcDeleteSubfields != null) marcDeleteSubfields = marcDeleteSubfields.trim(); if (reader != null && (marcIncludeIfPresent != null || marcIncludeIfMissing != null || marcDeleteSubfields != null || marcRemapRecord != null)) { if (marcRemapRecord != null) { String remapFilename = marcRemapRecord.trim(); // String configFilePath = PropertyUtils.getProperty(config, "config.file.dir"); // String propertySearchPath[] = PropertyUtils.makePropertySearchPath(solrmarcPath, siteSpecificPath, configFilePath, homeDir); String remapURL = PropertyUtils.getPropertyFileAbsoluteURL(searchDirectories, remapFilename, false, null); reader = new MarcFilteredReader(reader, marcIncludeIfPresent, marcIncludeIfMissing, marcDeleteSubfields, remapURL); } else { reader = new MarcFilteredReader(reader, marcIncludeIfPresent, marcIncludeIfMissing, marcDeleteSubfields); } } // Do translating last so that if we are Filtering as well as translating, we don't expend the // effort to translate records, which may then be filtered out and discarded. if (reader != null && to_utf_8 && unicodeNormalize != null) { reader = new MarcTranslatedReader(reader, unicodeNormalize); } return reader; } private void setMarc4JProperties(Properties configProps) { for (String prop : configProps.stringPropertyNames()) { if (prop.startsWith("org.marc4j.")) { String value = configProps.getProperty(prop); System.setProperty(prop, value); } else if (PropertyUtils.getProperty(configProps, "marc.override")!= null) { System.setProperty("org.marc4j.marc.MarcFactory", PropertyUtils.getProperty(configProps, "marc.override").trim()); } } } // We only get here if the parm (unicodeNormalize2) is not null compare it against // the valid values and return the correct value to use as the parm private String handleUnicodeNormalizeParm(String parm) { if (parm == null) return(null); if (parm.equalsIgnoreCase("KC") || parm.equalsIgnoreCase("CompatibilityCompose")) { parm = "KC"; } else if (parm.equalsIgnoreCase("C") || parm.equalsIgnoreCase("Compose") || parm.equalsIgnoreCase("true")) { parm = "C"; } else if (parm.equalsIgnoreCase("D") || parm.equalsIgnoreCase("Decompose")) { parm = "D"; } else if (parm.equalsIgnoreCase("KD") || parm.equalsIgnoreCase("CompatibiltyDecompose")) { parm = "KD"; } else { parm = null; } return(parm); } }