/*
* Copyright 2011
* Ubiquitous Knowledge Processing (UKP) Lab
* Technische Universität Darmstadt
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.tudarmstadt.ukp.dkpro.core.io.xml;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngineDescription;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader;
import static org.apache.uima.fit.pipeline.SimplePipeline.runPipeline;
import java.io.IOException;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.component.CasDumpWriter;
import org.junit.Test;
/**
 * Tests for the document-ID related configuration of {@link XmlXPathReader}.
 * <p>
 * Every test configures a reader over the XML fixtures below
 * {@code src/test/resources/input} and runs it through a {@link CasDumpWriter}. The tests for
 * valid documents make no assertions on the dumped output; they pass when the pipeline completes
 * without throwing. The tests for invalid documents pass when the exception type declared in
 * their {@code @Test(expected = ...)} annotation is thrown.
 */
public class XPathXmlReaderIdValidationTest
{
    private static final String VALID_DOCS_ROOT = "src/test/resources/input/valid_docs";
    private static final String INVALID_DOCS_ROOT = "src/test/resources/input/invalid_docs";

    /**
     * Runs the given reader through a pipeline whose only consumer is a {@link CasDumpWriter}
     * writing to the given file.
     *
     * @param source
     *            the already configured reader to drain
     * @param dumpFile
     *            value handed to {@link CasDumpWriter#PARAM_OUTPUT_FILE}
     * @throws UIMAException
     *             if the writer description cannot be created or the pipeline fails
     * @throws IOException
     *             if the pipeline reports an I/O problem
     */
    private static void dumpTo(CollectionReader source, String dumpFile)
        throws UIMAException, IOException
    {
        AnalysisEngineDescription dumper = createEngineDescription(
                CasDumpWriter.class,
                CasDumpWriter.PARAM_OUTPUT_FILE, dumpFile);
        runPipeline(source, dumper);
    }

    // ---------------------------------------------------------------------------------------
    // Valid docs
    // ---------------------------------------------------------------------------------------

    /**
     * Reads {@code /topics/top} elements from all {@code abbr*.xml} files (any directory depth)
     * with {@code num} configured as the document ID tag.
     */
    @Test
    public void idValidationTest()
        throws UIMAException, IOException
    {
        // Should find two files
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]**/abbr*.xml" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top",
                XmlXPathReader.PARAM_LANGUAGE, "en",
                XmlXPathReader.PARAM_DOC_ID_TAG, "num");

        dumpTo(xmlReader, "target/output/id_validation.txt");
    }

    /**
     * Reads two differently structured file families ({@code full*.xml} and {@code abbr*.xml})
     * in one run, using union expressions for both the document XPath and the ID tag.
     */
    @Test
    public void heteroFormatsIdValidationTest()
        throws UIMAException, IOException
    {
        // Should find two files
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]full*.xml", "[+]abbr*.xml" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/topic | /topics/top",
                XmlXPathReader.PARAM_LANGUAGE, "en",
                XmlXPathReader.PARAM_DOC_ID_TAG, "identifier | num");

        dumpTo(xmlReader, "target/output/hetero_formats_id_validation.txt");
    }

    /**
     * Uses an attribute ({@code @num}) instead of a child element as the document ID.
     * No language parameter is set for this reader.
     */
    @Test
    public void attributeIdTest()
        throws UIMAException, IOException
    {
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]attribute_id.xml" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top",
                XmlXPathReader.PARAM_DOC_ID_TAG, "@num");

        dumpTo(xmlReader, "target/output/attribute_id.txt");
    }

    /**
     * Uses a nested element path ({@code EN-title/num}) as the document ID.
     * No language parameter is set for this reader.
     */
    @Test
    public void deepTagIdTest()
        throws UIMAException, IOException
    {
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]deep_tag_id.xml" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top",
                XmlXPathReader.PARAM_DOC_ID_TAG, "EN-title/num");

        dumpTo(xmlReader, "target/output/deep_tag_id.txt");
    }

    /**
     * Uses an attribute of a nested element ({@code EN-title/@num}) as the document ID.
     * No language parameter is set for this reader.
     */
    @Test
    public void deepAttributeIdTest()
        throws UIMAException, IOException
    {
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, VALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]deep_attribute_id.xml" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top",
                XmlXPathReader.PARAM_DOC_ID_TAG, "EN-title/@num");

        dumpTo(xmlReader, "target/output/deep_attribute_id.txt");
    }

    // ---------------------------------------------------------------------------------------
    // Invalid docs
    // ---------------------------------------------------------------------------------------

    /**
     * Supplies a substitution-tag array with a single entry and expects an
     * {@link IllegalArgumentException}.
     */
    @Test(expected = IllegalArgumentException.class)
    public void invalidSubstitutionParameterTest()
        throws UIMAException, IOException
    {
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]*.*" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top",
                // The user should provide an even number of substitution parameters
                XmlXPathReader.PARAM_SUBSTITUTE_TAGS, new String[] { "EN-title" },
                XmlXPathReader.PARAM_LANGUAGE, "en");

        dumpTo(xmlReader, "target/output/invalid_subst_param.txt");
    }

    /**
     * The document contains the ID tag, but the tag has no value, e.g. {@code <num></num>}.
     * Expects an {@link IllegalStateException}.
     */
    @Test(expected = IllegalStateException.class)
    public void emptyIdTest()
        throws UIMAException, IOException
    {
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]empty_id.xml" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top",
                XmlXPathReader.PARAM_DOC_ID_TAG, "num",
                XmlXPathReader.PARAM_LANGUAGE, "en");

        dumpTo(xmlReader, "target/output/empty_id.txt");
    }

    /**
     * The document does not contain the ID tag at all.
     * Expects an {@link IllegalStateException}.
     */
    @Test(expected = IllegalStateException.class)
    public void noIdTagTest()
        throws UIMAException, IOException
    {
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]no_id_tag.xml" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top",
                XmlXPathReader.PARAM_DOC_ID_TAG, "num",
                XmlXPathReader.PARAM_LANGUAGE, "en");

        dumpTo(xmlReader, "target/output/no_id_tag.txt");
    }

    /**
     * A single document contains the ID tag twice, e.g.
     *
     * <pre>
     * &lt;top&gt;
     *   &lt;num&gt;01&lt;/num&gt;
     *   &lt;num&gt;01&lt;/num&gt;
     *   &lt;title&gt;.....
     *   ...
     * &lt;/top&gt;
     * </pre>
     *
     * Expects an {@link IllegalStateException}.
     */
    @Test(expected = IllegalStateException.class)
    public void nonUniqueIdTagTest()
        throws UIMAException, IOException
    {
        CollectionReader xmlReader = createReader(
                XmlXPathReader.class,
                XmlXPathReader.PARAM_SOURCE_LOCATION, INVALID_DOCS_ROOT,
                XmlXPathReader.PARAM_PATTERNS, new String[] { "[+]duplicated_id_tags.xml" },
                XmlXPathReader.PARAM_XPATH_EXPRESSION, "/topics/top",
                XmlXPathReader.PARAM_DOC_ID_TAG, "num",
                XmlXPathReader.PARAM_LANGUAGE, "en");

        dumpTo(xmlReader, "target/output/duplicated_id_tags.txt");
    }
}