/* * Copyright 2016 Christoph Böhme * * Licensed under the Apache License, Version 2.0 the "License"; * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.culturegraph.mf.biblio.marc21; import org.culturegraph.mf.biblio.iso2709.FieldHandler; import org.culturegraph.mf.biblio.iso2709.Record; import org.culturegraph.mf.biblio.iso2709.RecordFormat; import org.culturegraph.mf.framework.FluxCommand; import org.culturegraph.mf.framework.FormatException; import org.culturegraph.mf.framework.MissingIdException; import org.culturegraph.mf.framework.StreamReceiver; import org.culturegraph.mf.framework.annotations.Description; import org.culturegraph.mf.framework.annotations.In; import org.culturegraph.mf.framework.annotations.Out; import org.culturegraph.mf.framework.helpers.DefaultObjectPipe; /** * Decodes MARC 21 records into an event stream. This decoder only processes * single records. Input data containing multiple records must be split into * individual records before passing it to this decoder. * <p> * This decoder extracts the following parts from MARC 21 records: * <ul> * <li>bibliographic information in the record leader, * <li>record identifier, * <li>control fields, * <li>data fields. * </ul> * This decoder only supports MARC 21 records with UTF-8 encoding. Other * character coding schemes are not supported. A {@link FormatException} is * thrown if a record with an unsupported coding scheme is encountered. * <p> * The bibliographic information in the record leader is * <ul> * <li>record status, * <li>record type, * <li>bibliographic level, * <li>type of control, * <li>character coding scheme, * <li>encoding level, * <li>descriptive cataloging form, * <li>multipart resource record level. * </ul> * This information is emitted as an entity named * "{@value Marc21EventNames#LEADER_ENTITY}". It is emitted directly * after the <i>start-record</i> event. The entity contains the following * literals: * <ol> * <li>{@value Marc21EventNames#RECORD_STATUS_LITERAL} * <li>{@value Marc21EventNames#RECORD_TYPE_LITERAL} * <li>{@value Marc21EventNames#BIBLIOGRAPHIC_LEVEL_LITERAL} * <li>{@value Marc21EventNames#TYPE_OF_CONTROL_LITERAL} * <li>{@value Marc21EventNames#CHARACTER_CODING_LITERAL} * <li>{@value Marc21EventNames#ENCODING_LEVEL_LITERAL} * <li>{@value Marc21EventNames#CATALOGING_FORM_LITERAL} * <li>{@value Marc21EventNames#MULTIPART_LEVEL_LITERAL} * </ol> * The literals are emitted in the order in which they are listed here. The * values of these literals are the characters at the corresponding * positions in the record leader (see * <a href="http://www.loc.gov/marc/bibliographic/bdleader.html">MARC 21 * Standard: Record Leader</a> for a description of the allowed values). The * literal values are always only single characters. As this decoder only * supports MARC 21 records with UTF-8 encoding, the value of the <i>literal * "{@value Marc21EventNames#CHARACTER_CODING_LITERAL}"</i> will * always be "a". * <p> * For example, given a record with the leader * <pre> * 00128noa a2200073zu 4500 * </pre> * the following event stream will be emitted: * <pre> * start-record "1" * start-entity "{@value Marc21EventNames#LEADER_ENTITY}" * literal "{@value Marc21EventNames#RECORD_STATUS_LITERAL}": n * literal "{@value Marc21EventNames#RECORD_TYPE_LITERAL}": o * literal "{@value Marc21EventNames#BIBLIOGRAPHIC_LEVEL_LITERAL}": a * literal "{@value Marc21EventNames#TYPE_OF_CONTROL_LITERAL}": " " * literal "{@value Marc21EventNames#CHARACTER_CODING_LITERAL}": a * literal "{@value Marc21EventNames#ENCODING_LEVEL_LITERAL}": z * literal "{@value Marc21EventNames#CATALOGING_FORM_LITERAL}": u * literal "{@value Marc21EventNames#MULTIPART_LEVEL_LITERAL}": " " * end-entity * … * </pre> * The record identifier is taken from field "001". It is used as * identifier in the <i>start-record</i> event. Additionally, it is emitted as * a control field (since that is what it is technically). The behaviour of * the decoder if a record has no identifier can be configured through the * {@link #setIgnoreMissingId(boolean)} parameter. * <p> * Control fields are emitted as literals with their tag as literal name and * their field value as literal value. * <p> * Data fields are emitted as entities. The entity name consists of the tag * followed by the two indicator characters of the field. For each sub field * in the data field a <i>literal</i> event is emitted. The literal name is * the identifier character of the sub field and the literal value is the data * value of the sub field. * <p> * All fields are emitted in the order in which they appear in the directory * of the MARC 21 record. For overlong fields which have multiple directory * entries (see section 4.4.4 in the ISO 2709:2008 standard), only the tag * from the first entry is used, the remaining ones are ignored. * <p> * Empty control fields and sub fields are emitted as literals with an empty * value. Data fields without sub fields produce only a <i>start-entity</i> * and an <i>end-entity</i> event without any <i>literal</i> events in-between. * If the decoder receives an empty input string it is ignored and no stream * events are emitted. * <p> * If an error occurs during decoding, a {@link FormatException} is thrown. * * @author Christoph Böhme * @see "ISO 2709:2008 Standard" * @see <a href="http://www.loc.gov/marc/specifications/spechome.html">MARC-21 * Standards</a> */ @In(String.class) @Out(StreamReceiver.class) @Description("Decodes MARC 21 records") @FluxCommand("decode-marc21") public final class Marc21Decoder extends DefaultObjectPipe<String, StreamReceiver> { private final FieldHandler fieldHandler = new Marc21Handler(); private boolean ignoreMissingId; /** * Controls whether the decoder aborts processing if a record has no * identifier. A {@link MissingIdException} is thrown in these cases. * If this parameter is set to true then the identifier emitted with the * <i>start-record</i> event of records without field "001" will * be an empty string. * <p> * The default value of {@code ignoreMissingId} is false. * <p> * This parameter can be changed anytime during processing. The new value * becomes effective with the next record being processed. * * @param ignoreMissingId * true if missing identifiers should be silently ignored. */ public void setIgnoreMissingId(final boolean ignoreMissingId) { this.ignoreMissingId = ignoreMissingId; } public boolean getIgnoreMissingId() { return ignoreMissingId; } @Override public void process(final String obj) { if (obj.isEmpty()) { return; } final Record record = new Record(obj.getBytes(Marc21Constants.MARC21_CHARSET)); record.setCharset(Marc21Constants.MARC21_CHARSET); requireMarc21RecordFormat(record.getRecordFormat()); requireUTF8Encoding(record); getReceiver().startRecord(tryGetRecordId(record)); emitLeader(record); record.processFields(fieldHandler); getReceiver().endRecord(); } private void requireMarc21RecordFormat(final RecordFormat format) { if (!Marc21Constants.MARC21_FORMAT.equals(format)) { throw new FormatException("invalid record format. Expected " + Marc21Constants.MARC21_FORMAT + " but got " + format); } } private void requireUTF8Encoding(final Record record) { if (record.getImplCodes()[Marc21Constants.CHARACTER_CODING_INDEX] != 'a') { throw new FormatException( "invalid record encoding. Only UTF-8 is supported"); } } private String tryGetRecordId(final Record record) { final String id = record.getRecordId(); if (id == null) { if (!ignoreMissingId) { throw new MissingIdException("record has no id"); } return ""; } return id; } private void emitLeader(final Record record) { final char[] implCodes = record.getImplCodes(); final char[] systemChars = record.getSystemChars(); getReceiver().startEntity(Marc21EventNames.LEADER_ENTITY); getReceiver().literal(Marc21EventNames.RECORD_STATUS_LITERAL, String.valueOf( record.getRecordStatus())); getReceiver().literal(Marc21EventNames.RECORD_TYPE_LITERAL, String.valueOf( implCodes[Marc21Constants.RECORD_TYPE_INDEX])); getReceiver().literal(Marc21EventNames.BIBLIOGRAPHIC_LEVEL_LITERAL, String.valueOf( implCodes[Marc21Constants.BIBLIOGRAPHIC_LEVEL_INDEX])); getReceiver().literal(Marc21EventNames.TYPE_OF_CONTROL_LITERAL, String.valueOf( implCodes[Marc21Constants.TYPE_OF_CONTROL_INDEX])); getReceiver().literal(Marc21EventNames.CHARACTER_CODING_LITERAL, String.valueOf( implCodes[Marc21Constants.CHARACTER_CODING_INDEX])); getReceiver().literal(Marc21EventNames.ENCODING_LEVEL_LITERAL, String.valueOf( systemChars[Marc21Constants.ENCODING_LEVEL_INDEX])); getReceiver().literal(Marc21EventNames.CATALOGING_FORM_LITERAL, String.valueOf( systemChars[Marc21Constants.CATALOGING_FORM_INDEX])); getReceiver().literal(Marc21EventNames.MULTIPART_LEVEL_LITERAL, String.valueOf( systemChars[Marc21Constants.MULTIPART_LEVEL_INDEX])); getReceiver().endEntity(); } /** * Emits the fields in a MARC 21 record as stream events. */ private final class Marc21Handler implements FieldHandler { @Override public void referenceField(final char[] tag, final char[] implDefinedPart, final String value) { getReceiver().literal(String.valueOf(tag), value); } @Override public void startDataField(final char[] tag, final char[] implDefinedPart, final char[] indicators) { getReceiver().startEntity(buildName(tag, indicators)); } private String buildName(final char[] tag, final char[] indicators) { final int nameLength = tag.length + indicators.length; final char[] name = new char[nameLength]; System.arraycopy(tag, 0, name, 0, tag.length); System.arraycopy(indicators, 0, name, tag.length, indicators.length); return String.valueOf(name); } @Override public void endDataField() { getReceiver().endEntity(); } @Override public void additionalImplDefinedPart(final char[] implDefinedPart) { // Nothing to do. MARC 21 does not use implementation defined parts. } @Override public void data(final char[] identifier, final String value) { getReceiver().literal(String.valueOf(identifier[0]), value); } } }