/*
* Copyright 2016 Christoph Böhme
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.culturegraph.mf.biblio.pica;
import static org.mockito.Mockito.inOrder;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoMoreInteractions;
import org.culturegraph.mf.framework.MissingIdException;
import org.culturegraph.mf.framework.StreamReceiver;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.mockito.InOrder;
import org.mockito.Mock;
import org.mockito.MockitoAnnotations;
/**
 * Tests for class {@link PicaDecoder}.
 * <p>
 * Every test hands a serialized record to {@code PicaDecoder.process} and
 * verifies the events that arrive at a mocked {@link StreamReceiver}.
 *
 * @author Christoph Böhme
 *
 */
public final class PicaDecoderTest {

    /** Record id carried in subfield 0 of the id fields used by the tests. */
    private static final String RECORD_ID = "2809";

    private static final String ENTITY_028A = "028A";
    private static final String NAME_A = "a";
    private static final String NAME_D = "d";
    private static final String VALUE_A = "Eco";
    private static final String VALUE_D = "Umberto";

    // Both spellings are written with Unicode escapes so that the difference
    // between them cannot get lost when the source file is re-encoded or
    // normalized by an editor or a build tool.
    private static final String COMPOSED_UTF8 = "U\u0308ber";  // 'Ü' constructed from U and diacritics
    private static final String STANDARD_UTF8 = "\u00dcber";  // 'Ü' is a single character

    // Control characters which delimit the parts of a serialized record.
    private static final String RECORD_MARKER = "\u001d";
    private static final String FIELD_MARKER = "\u001e";
    private static final String SUBFIELD_MARKER = "\u001f";
    private static final String FIELD_END_MARKER = "\n";

    // Serialized fields: field name, a blank, then subfield marker, one
    // character subfield name and the subfield value.
    private static final String FIELD_001AT_0_TEST = "001@ " + SUBFIELD_MARKER + "0test";
    private static final String FIELD_003AT_0_ID = "003@ " + SUBFIELD_MARKER + "0" + RECORD_ID;
    private static final String FIELD_107F_0_ID = "107F " + SUBFIELD_MARKER + "0" + RECORD_ID;
    private static final String FIELD_203AT_0_ID = "203@ " + SUBFIELD_MARKER + "0" + RECORD_ID;
    private static final String FIELD_203AT_01_0_ID = "203@/01 " + SUBFIELD_MARKER + "0" + RECORD_ID;
    private static final String FIELD_203AT_100_0_ID = "203@/100 " + SUBFIELD_MARKER + "0" + RECORD_ID;
    private static final String FIELD_021A_A_UEBER = "021A " + SUBFIELD_MARKER + NAME_A + COMPOSED_UTF8;
    // Field name only; the tests append the subfields they need.
    private static final String FIELD_028A = ENTITY_028A + " ";

    private PicaDecoder picaDecoder;

    @Mock
    private StreamReceiver receiver;

    /**
     * Creates a fresh decoder for each test and connects it to the mocked
     * receiver.
     */
    @Before
    public void setup() {
        MockitoAnnotations.initMocks(this);
        picaDecoder = new PicaDecoder();
        picaDecoder.setReceiver(receiver);
    }

    /**
     * Closes the decoder's stream after each test.
     */
    @After
    public void cleanup() {
        picaDecoder.closeStream();
    }

    // ---- Records starting with the different markers ----

    @Test
    public void shouldParseRecordStartingWithRecordMarker() {
        picaDecoder.process(
                RECORD_MARKER + FIELD_001AT_0_TEST +
                FIELD_MARKER + FIELD_003AT_0_ID);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify001At0Test(ordered);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldParseRecordStartingWithFieldMarker() {
        picaDecoder.process(
                FIELD_MARKER + FIELD_001AT_0_TEST +
                FIELD_MARKER + FIELD_003AT_0_ID);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify001At0Test(ordered);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
    }

    /**
     * A subfield without a preceding field name is expected to be emitted
     * inside an entity with an empty name.
     */
    @Test
    public void shouldParseRecordStartingWithSubfieldMarker() {
        picaDecoder.process(
                SUBFIELD_MARKER + NAME_A + VALUE_A +
                FIELD_MARKER + FIELD_003AT_0_ID);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        ordered.verify(receiver).startEntity("");
        ordered.verify(receiver).literal(NAME_A, VALUE_A);
        ordered.verify(receiver).endEntity();
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldParseRecordStartingWithEmptySubfield() {
        picaDecoder.process(
                SUBFIELD_MARKER +
                FIELD_MARKER + FIELD_003AT_0_ID);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldParseRecordStartingWithFieldEndMarker() {
        picaDecoder.process(
                FIELD_END_MARKER + FIELD_001AT_0_TEST +
                FIELD_MARKER + FIELD_003AT_0_ID);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify001At0Test(ordered);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldParseRecordStartingWithFieldName() {
        picaDecoder.process(
                FIELD_001AT_0_TEST +
                FIELD_MARKER + FIELD_003AT_0_ID);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify001At0Test(ordered);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
    }

    // ---- Records ending with the different markers ----

    @Test
    public void shouldParseRecordEndingWithRecordMarker() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_001AT_0_TEST + RECORD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        verify001At0Test(ordered);
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldParseRecordEndingWithFieldMarker() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_001AT_0_TEST + FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        verify001At0Test(ordered);
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldParseRecordEndingWithSubfieldMarker() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_028A +
                SUBFIELD_MARKER + NAME_A + VALUE_A +
                SUBFIELD_MARKER + NAME_D + VALUE_D +
                SUBFIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).startEntity(ENTITY_028A);
        ordered.verify(receiver).literal(NAME_A, VALUE_A);
        ordered.verify(receiver).literal(NAME_D, VALUE_D);
        ordered.verify(receiver).endEntity();
        ordered.verify(receiver).endRecord();
    }

    /**
     * A trailing subfield which consists of a name only is expected to be
     * emitted as a literal with an empty value.
     */
    @Test
    public void shouldParseRecordEndingWithSubfieldName() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_028A +
                SUBFIELD_MARKER + NAME_A + VALUE_A +
                SUBFIELD_MARKER + NAME_D);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).startEntity(ENTITY_028A);
        ordered.verify(receiver).literal(NAME_A, VALUE_A);
        ordered.verify(receiver).literal(NAME_D, "");
        ordered.verify(receiver).endEntity();
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldParseRecordEndingWithFieldName() {
        // Do not skip the last field because it has no
        // sub fields:
        picaDecoder.setSkipEmptyFields(false);

        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_028A);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).startEntity(ENTITY_028A);
        ordered.verify(receiver).endEntity();
        ordered.verify(receiver).endRecord();
    }

    /**
     * Every marker is followed by a field end marker (a line break) here,
     * so the record is spread over several lines.
     */
    @Test
    public void shouldParseMultiLineRecordFormat() {
        picaDecoder.process(
                RECORD_MARKER + FIELD_END_MARKER +
                FIELD_MARKER + FIELD_001AT_0_TEST + FIELD_END_MARKER +
                FIELD_MARKER + FIELD_003AT_0_ID + FIELD_END_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify001At0Test(ordered);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
    }

    // ---- Extraction of the record id ----

    @Test
    public void shouldExtractPicaProductionNumberAfterRecordMarkerAsRecordId() {
        picaDecoder.process(RECORD_MARKER + FIELD_003AT_0_ID);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractPicaProductionNumberAfterFieldMarkerAsRecordId() {
        picaDecoder.process(FIELD_MARKER + FIELD_003AT_0_ID);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractPicaProductionNumberAfterFieldEndMarkerAsRecordId() {
        picaDecoder.process(FIELD_END_MARKER + FIELD_003AT_0_ID);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractPicaProductionNumberFollowedByRecordMarkerAsRecordId() {
        picaDecoder.process(FIELD_003AT_0_ID + RECORD_MARKER);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractPicaProductionNumberFollowedByFieldMarkerAsRecordId() {
        picaDecoder.process(FIELD_003AT_0_ID + FIELD_MARKER);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractPicaProductionNumberFollowedBySubfieldMarkerAsRecordId() {
        picaDecoder.process(FIELD_003AT_0_ID + SUBFIELD_MARKER);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractPicaProductionNumberFollowedByFieldEndMarkerAsRecordId() {
        picaDecoder.process(FIELD_003AT_0_ID + FIELD_END_MARKER);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractPicaProductionNumberAtRecordEndAsRecordId() {
        picaDecoder.process(FIELD_003AT_0_ID);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractLocalProductionNumberAsRecordId() {
        picaDecoder.process(FIELD_107F_0_ID);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractCopyControlNumberAsRecordId() {
        picaDecoder.process(FIELD_203AT_0_ID);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractCopyControlNumberWithOccurrenceAsRecordId() {
        picaDecoder.process(FIELD_203AT_01_0_ID);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test
    public void shouldExtractCopyControlNumberWithThreeDigitOccurrenceAsRecordId() {
        picaDecoder.process(FIELD_203AT_100_0_ID);

        verify(receiver).startRecord(RECORD_ID);
    }

    @Test(expected = MissingIdException.class)
    public void shouldThrowMissingIdExceptionIfNoRecordIdIsFound() {
        picaDecoder.process(FIELD_001AT_0_TEST);
        // Exception expected
    }

    /**
     * The id field directly follows the data of the preceding field
     * without any marker in between, so it must not be taken as the
     * record id. As missing ids are ignored here the record is expected
     * to start with an empty id.
     */
    @Test
    public void shouldIgnoreMatchWithinFieldData() {
        picaDecoder.setIgnoreMissingIdn(true);

        picaDecoder.process(FIELD_001AT_0_TEST + FIELD_003AT_0_ID);

        verify(receiver).startRecord("");
    }

    /**
     * The id field name is present but the field ends before a subfield
     * 0 is given, so no id can be extracted from it.
     */
    @Test
    public void shouldIgnoreIncompleteMatch() {
        picaDecoder.setIgnoreMissingIdn(true);

        picaDecoder.process("003@ " + FIELD_MARKER + FIELD_001AT_0_TEST);

        verify(receiver).startRecord("");
    }

    // ---- Unnamed fields and subfields ----

    @Test
    public void shouldSkipUnnamedFieldsWithNoSubFields() {
        // Make sure that the field is skipped because
        // it is empty and not because it has no sub
        // fields:
        picaDecoder.setSkipEmptyFields(false);

        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
        verifyNoMoreInteractions(receiver);
    }

    @Test
    public void shouldSkipUnnamedFieldsWithOnlyUnnamedSubFields() {
        // Make sure that the field is skipped because
        // it is empty and not because it only has empty
        // sub fields:
        picaDecoder.setSkipEmptyFields(false);

        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                SUBFIELD_MARKER + FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
        verifyNoMoreInteractions(receiver);
    }

    @Test
    public void shouldNotSkipUnnamedFieldsWithSubFields() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                SUBFIELD_MARKER + NAME_A + VALUE_A +
                FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).startEntity("");
        ordered.verify(receiver).literal(NAME_A, VALUE_A);
        ordered.verify(receiver).endEntity();
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldSkipUnnamedSubfields() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_028A +
                SUBFIELD_MARKER +
                SUBFIELD_MARKER + NAME_A + VALUE_A +
                FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).startEntity(ENTITY_028A);
        ordered.verify(receiver).literal(NAME_A, VALUE_A);
        ordered.verify(receiver).endEntity();
        ordered.verify(receiver).endRecord();
        verifyNoMoreInteractions(receiver);
    }

    // ---- Option: skip empty fields ----

    @Test
    public void shouldSkipEmptyFieldsByDefault() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_028A + FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
        verifyNoMoreInteractions(receiver);
    }

    @Test
    public void shouldSkipFieldsWithOnlyUnnamedSubfieldsByDefault() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_028A +
                SUBFIELD_MARKER +
                FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).endRecord();
        verifyNoMoreInteractions(receiver);
    }

    @Test
    public void shouldNotSkipEmptyFieldsIfConfigured() {
        picaDecoder.setSkipEmptyFields(false);

        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_028A + FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).startEntity(ENTITY_028A);
        ordered.verify(receiver).endEntity();
        ordered.verify(receiver).endRecord();
    }

    @Test
    public void shouldNotSkipFieldsWithOnlyUnnamedSubfieldsIfConfigured() {
        picaDecoder.setSkipEmptyFields(false);

        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_028A +
                SUBFIELD_MARKER +
                FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        ordered.verify(receiver).startEntity(ENTITY_028A);
        ordered.verify(receiver).endEntity();
        ordered.verify(receiver).endRecord();
    }

    // ---- Option: ignore missing id ----

    @Test(expected = MissingIdException.class)
    public void shouldFailIfIdIsMissingByDefault() {
        picaDecoder.process(
                FIELD_001AT_0_TEST + FIELD_MARKER);
    }

    @Test
    public void shouldIgnoreMissingIdIfConfigured() {
        picaDecoder.setIgnoreMissingIdn(true);

        picaDecoder.process(
                FIELD_001AT_0_TEST + FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord("");
        verify001At0Test(ordered);
        ordered.verify(receiver).endRecord();
    }

    // ---- Option: normalize UTF-8 ----

    /**
     * Without normalization the value is expected to arrive exactly as
     * it was passed in, i.e. still as base letter plus combining mark.
     */
    @Test
    public void shouldNotNormalizeUTF8ByDefault() {
        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_021A_A_UEBER + FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        verify021AAUeber(ordered, COMPOSED_UTF8);
        ordered.verify(receiver).endRecord();
    }

    /**
     * With normalization switched on the base letter and the combining
     * mark are expected to be merged into the single character form.
     */
    @Test
    public void shouldNormalizeUTF8IfConfigured() {
        picaDecoder.setNormalizeUTF8(true);

        picaDecoder.process(
                FIELD_003AT_0_ID + FIELD_MARKER +
                FIELD_021A_A_UEBER + FIELD_MARKER);

        final InOrder ordered = inOrder(receiver);
        ordered.verify(receiver).startRecord(RECORD_ID);
        verify003At0ID(ordered);
        verify021AAUeber(ordered, STANDARD_UTF8);
        ordered.verify(receiver).endRecord();
    }

    // ---- Option: trim field names ----

    @Test
    public void shouldTrimWhitespaceInFieldNamesByDefault() {
        picaDecoder.process(
                " fieldname " + SUBFIELD_MARKER + "0subfield" +
                FIELD_MARKER + FIELD_003AT_0_ID);

        verify(receiver).startEntity("fieldname");
    }

    @Test
    public void shouldNotTrimWhitespaceInFieldNamesIfConfigured() {
        picaDecoder.setTrimFieldNames(false);

        picaDecoder.process(
                " fieldname " + SUBFIELD_MARKER + "0subfield" +
                FIELD_MARKER + FIELD_003AT_0_ID);

        verify(receiver).startEntity(" fieldname ");
    }

    // ---- Verification helpers ----

    /**
     * Verifies that the next events are an entity named 003@ which
     * contains a single literal 0 holding {@link #RECORD_ID}.
     *
     * @param ordered the in-order verifier the checks are appended to
     */
    private void verify003At0ID(final InOrder ordered) {
        ordered.verify(receiver).startEntity("003@");
        ordered.verify(receiver).literal("0", RECORD_ID);
        ordered.verify(receiver).endEntity();
    }

    /**
     * Verifies that the next events are an entity named 001@ which
     * contains a single literal 0 with the value "test".
     *
     * @param ordered the in-order verifier the checks are appended to
     */
    private void verify001At0Test(final InOrder ordered) {
        ordered.verify(receiver).startEntity("001@");
        ordered.verify(receiver).literal("0", "test");
        ordered.verify(receiver).endEntity();
    }

    /**
     * Verifies that the next events are an entity named 021A which
     * contains a single literal a with the given value.
     *
     * @param ordered the in-order verifier the checks are appended to
     * @param value the value expected for the literal
     */
    private void verify021AAUeber(final InOrder ordered, final String value) {
        ordered.verify(receiver).startEntity("021A");
        ordered.verify(receiver).literal(NAME_A, value);
        ordered.verify(receiver).endEntity();
    }

}