/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
* NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package org.apache.pig.piggybank.test.storage;
import static org.apache.pig.ExecType.LOCAL;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Random;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import junit.framework.TestCase;
import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.MRConfiguration;
import org.apache.pig.data.Tuple;
import org.apache.pig.test.Util;
import org.w3c.dom.Document;
/**
 * Tests for the piggybank {@code XMLLoader}.
 *
 * <p>Each test obtains an XML input (a generated temp file, a compressed copy of one, or checked-in
 * test data), loads it through a local-mode {@link PigServer} using
 * {@code XMLLoader('<tag>')}, and checks the tuples that come back.
 */
public class TestXMLLoader extends TestCase {

    /** Fully-qualified loader class name as it is written into the LOAD statement. */
    private static final String XML_LOADER = "org.apache.pig.piggybank.storage.XMLLoader";

    /** Basic fixture: two {@code <property>} elements, one {@code <ignoreProperty>}, three {@code <name>}. */
    public static ArrayList<String[]> data = asLines(
        "<configuration>",
        "<property>",
        "<name> foobar </name>",
        "<value> barfoo </value>",
        "</property>",
        "<ignoreProperty>",
        "<name> foo </name>",
        "</ignoreProperty>",
        "<property>",
        "<name> justname </name>",
        "</property>",
        "</configuration>");

    /** Fixture with three top-level {@code <event>} elements, each containing nested {@code <event>} children. */
    public static ArrayList<String[]> nestedTags = asLines(
        "<events>",
        "<event id='116913365'>",
        "<eventRank>1.000000000000</eventRank>",
        "<name>XY</name>",
        "<relatedEvents>",
        "<event id='116913365'>x</event>",
        "<event id='116913365'>y</event>",
        "</relatedEvents>",
        "</event>",
        "<event id='116913365'>",
        "<eventRank>3.0000</eventRank>",
        "<name>AB</name>",
        "<relatedEvents>",
        "<event id='116913365'>a</event>",
        "<event id='116913365'>b</event>",
        "</relatedEvents>",
        "</event>",
        "<event>",
        "<eventRank>4.0000</eventRank>",
        "<name>CD</name>",
        "<relatedEvents>",
        "<event>c</event>",
        "<event>d</event>",
        "</relatedEvents>",
        "</event>",
        "</events>");

    /** Fixture mixing self-closed ({@code <event/>}) and regular {@code <event>} elements. */
    public static ArrayList<String[]> inlineClosedTags = asLines(
        "<events>",
        "<event id='3423'/>",
        "<event/>",
        "<event><event/></event>",
        "<event id='33'><tag k='a' v='b'/></event>",
        "</events>");

    /** Fixture with an indented opening tag whose content continues on the next line. */
    public static ArrayList<String[]> indentedXmlWithMultilineLineContent = asLines(
        " <page>You have ",
        "not missed it</page>");

    /**
     * Wraps each string in a one-element array, the row shape {@code TestHelper.createTempFile}
     * is called with throughout this class.
     */
    private static ArrayList<String[]> asLines(String... lines) {
        ArrayList<String[]> rows = new ArrayList<String[]>(lines.length);
        for (String line : lines) {
            rows.add(new String[] { line });
        }
        return rows;
    }

    /** Doubles every backslash in {@code path} before it is embedded in the quoted LOAD statement. */
    private static String escapeBackslashes(String path) {
        return path.replace("\\", "\\\\");
    }

    /**
     * Registers {@code A = LOAD '<escapedPath>' USING XMLLoader('<tagName>') as (doc:chararray);}
     * on {@code pig} and opens an iterator over alias {@code A}.
     *
     * @param pig server to run the query on
     * @param escapedPath input path, already escaped for use inside a single-quoted Pig string
     * @param tagName tag name passed to the loader
     * @return iterator over the loaded tuples
     */
    private static Iterator<Tuple> loadTag(PigServer pig, String escapedPath, String tagName)
            throws Exception {
        pig.registerQuery("A = LOAD '" + escapedPath + "' USING " + XML_LOADER
                + "('" + tagName + "') as (doc:chararray);");
        return pig.openIterator("A");
    }

    /**
     * Counts tuples with at least one field, stopping early if the iterator yields {@code null}.
     */
    private static int countNonEmptyTuples(Iterator<Tuple> tuples) {
        int count = 0;
        while (tuples.hasNext()) {
            Tuple tuple = tuples.next();
            if (tuple == null) {
                break;
            }
            if (tuple.size() > 0) {
                count++;
            }
        }
        return count;
    }

    /**
     * Writes {@code lines} to a temp file via {@code TestHelper.createTempFile}, loads it with the
     * given tag in local mode, and returns the number of non-empty tuples.
     */
    private static int countMatches(ArrayList<String[]> lines, String tagName) throws Exception {
        String filename = TestHelper.createTempFile(lines, "");
        PigServer pig = new PigServer(LOCAL);
        return countNonEmptyTuples(loadTag(pig, escapeBackslashes(filename), tagName));
    }

    /**
     * Writes {@link #data} to a temp file, compresses it in place by running the external
     * {@code tool} on it, loads {@code <file><extension>} with the given tag, and returns the
     * number of non-empty tuples. The compressed file is deleted afterwards.
     *
     * @param tool name of the compression executable (e.g. {@code bzip2}); must be on the PATH
     * @param extension suffix the tool appends to the file name (e.g. {@code .bz2})
     * @param tagName tag name passed to the loader
     */
    private static int countMatchesInCompressedData(String tool, String extension, String tagName)
            throws Exception {
        String filename = TestHelper.createTempFile(data, "");
        // Pass the path as its own argument: Runtime.exec(String) would split it on whitespace.
        Process compressor = new ProcessBuilder(tool, filename).start();
        int exitCode = compressor.waitFor();
        if (exitCode != 0) {
            fail("'" + tool + " " + filename + "' exited with status " + exitCode);
        }
        String compressed = filename + extension;
        try {
            PigServer pigServer = new PigServer(LOCAL);
            return countNonEmptyTuples(loadTag(pigServer, Util.encodeEscape(compressed), tagName));
        } finally {
            new File(compressed).delete();
        }
    }

    /** Parses {@code xml} as a standalone document; throws if it is not well-formed. */
    private static Document parseXml(DocumentBuilder builder, String xml) throws Exception {
        // Explicit charset: the no-arg getBytes() depends on the platform default encoding.
        return builder.parse(new ByteArrayInputStream(xml.getBytes("UTF-8")));
    }

    /** A tag that does not occur in the input yields no tuples. */
    public void testShouldReturn0TupleCountIfSearchTagIsNotFound() throws Exception {
        assertEquals(0, countMatches(data, "invalid"));
    }

    /** Both {@code <property>} elements of the basic fixture are returned. */
    public void testLoadXMLLoader() throws Exception {
        assertEquals(2, countMatches(data, "property"));
    }

    /** The basic fixture is still readable after being compressed with the external bzip2 tool. */
    public void testXMLLoaderShouldLoadBasicBzip2Files() throws Exception {
        assertEquals(2, countMatchesInCompressedData("bzip2", ".bz2", "property"));
    }

    /** The basic fixture is still readable after being compressed with the external gzip tool. */
    public void testLoaderShouldLoadBasicGzFile() throws Exception {
        assertEquals(2, countMatchesInCompressedData("gzip", ".gz", "property"));
    }

    /** Searching for {@code name} must not match {@code <namethisalso>}; only the three {@code <name>} elements count. */
    public void testXMLLoaderShouldNotConfusedWithTagsHavingSimilarPrefix() throws Exception {
        ArrayList<String[]> testData = new ArrayList<String[]>();
        testData.add(new String[] { "<namethisalso> foobar9 </namethisalso>" });
        testData.addAll(data);
        assertEquals(3, countMatches(testData, "name"));
    }

    /** The single {@code <ignoreProperty>} element in the middle of the fixture is returned. */
    public void testShouldReturn1ForIntermediateTagData() throws Exception {
        assertEquals(1, countMatches(data, "ignoreProperty"));
    }

    /** An element whose closing tag is missing yields no tuples. */
    public void testShouldReturn0TupleCountIfNoEndTagIsFound() throws Exception {
        // Copy the fixture without the </ignoreProperty> line so the element is never closed.
        ArrayList<String[]> testData = new ArrayList<String[]>();
        for (String[] content : data) {
            if (!content[0].equals("</ignoreProperty>")) {
                testData.add(content);
            }
        }
        assertEquals(0, countMatches(testData, "ignoreProperty"));
    }

    /** An input built from an empty line list yields no tuples. */
    public void testShouldReturn0TupleCountIfEmptyFileIsPassed() throws Exception {
        assertEquals(0, countMatches(new ArrayList<String[]>(), "ignoreProperty"));
    }

    /** Nested {@code <event>} children are not reported separately: three top-level events, three tuples. */
    public void testXMLLoaderShouldSupportNestedTagWithSameName() throws Exception {
        assertEquals(3, countMatches(nestedTags, "event"));
    }

    /** Self-closed and regular {@code <event>} elements are both returned: four tuples in total. */
    public void testXMLLoaderShouldWorkWithInlineClosedTags() throws Exception {
        assertEquals(4, countMatches(inlineClosedTags, "event"));
    }

    /** An indented element whose content spans two lines is returned as a single tuple with the expected text. */
    public void testXMLLoaderShouldWorkWithIndentedXmlWithMultilineContent() throws Exception {
        String filename = TestHelper.createTempFile(indentedXmlWithMultilineLineContent, "");
        PigServer pig = new PigServer(LOCAL);
        Iterator<Tuple> it = loadTag(pig, escapeBackslashes(filename), "page");
        int tupleCount = 0;
        while (it.hasNext()) {
            Tuple tuple = it.next();
            if (tuple == null) {
                break;
            }
            // assertEquals reports both strings on mismatch, unlike assertTrue(a.equals(b)).
            assertEquals("<page>You have not missed it</page>", (String) tuple.get(0));
            tupleCount++;
        }
        assertEquals(1, tupleCount);
    }

    /** Every tuple returned for the inline-closed fixture parses as a well-formed XML document. */
    public void testXMLLoaderShouldReturnValidXML() throws Exception {
        String filename = TestHelper.createTempFile(inlineClosedTags, "");
        PigServer pig = new PigServer(LOCAL);
        Iterator<Tuple> it = loadTag(pig, escapeBackslashes(filename), "event");
        DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        int parsed = 0;
        while (it.hasNext()) {
            Tuple tuple = it.next();
            if (tuple == null) {
                break;
            }
            // parse() throws on malformed input, which fails the test.
            parseXml(docBuilder, (String) tuple.get(0));
            parsed++;
        }
        // Guard against passing vacuously when the loader returns nothing at all.
        assertTrue("No tuples were returned, so nothing was validated", parsed > 0);
    }

    /**
     * This test case tests the special case when a non-matching tag spans two file
     * splits in a .bz2 compressed file. At the same time, the part that falls in
     * the first split is a prefix of the matching tag.
     * In other words, until the end of the first split, it looks like the tag is
     * matching but it is not actually matching.
     *
     * @throws Exception
     */
    public void testXMLLoaderShouldNotReturnLastNonMatchedTag() throws Exception {
        Configuration conf = new Configuration();
        long blockSize = 100 * 1024;
        conf.setLong("fs.local.block.size", blockSize);
        String tagName = "event";
        PigServer pig = new PigServer(LOCAL, conf);
        FileSystem localFs = FileSystem.getLocal(conf);
        FileStatus[] testFiles = localFs.globStatus(
                new Path("src/test/java/org/apache/pig/piggybank/test/evaluation/xml/data/*xml.bz2"));
        assertTrue("No test files", testFiles != null && testFiles.length > 0);
        for (FileStatus testFile : testFiles) {
            String testFileName = escapeBackslashes(testFile.getPath().toUri().getPath());
            Iterator<Tuple> it = loadTag(pig, testFileName, tagName);
            while (it.hasNext()) {
                Tuple tuple = it.next();
                if (tuple == null) {
                    break;
                }
                if (tuple.size() > 0) {
                    String element = (String) tuple.get(0);
                    assertTrue("Returned text does not start with <" + tagName + ">: " + element,
                            element.startsWith("<" + tagName + ">"));
                }
            }
        }
    }

    /**
     * This test checks that a multi-line tag spanning two splits should be
     * matched.
     *
     * @throws Exception
     */
    public void testXMLLoaderShouldMatchTagSpanningSplits() throws Exception {
        Configuration conf = new Configuration();
        long blockSize = 512;
        conf.setLong("fs.local.block.size", blockSize);
        conf.setLong(MRConfiguration.MAX_SPLIT_SIZE, blockSize);
        String tagName = "event";
        File tempFile = File.createTempFile("long-file", ".xml");
        FileSystem localFs = FileSystem.getLocal(conf);
        Path tempPath = new Path(tempFile.getAbsolutePath());
        try {
            String matchingElement = "<event>\ndata\n</event>\n";
            long pos = 0;
            int matchingCount = 0;
            PrintStream ps = new PrintStream(localFs.create(tempPath, true));
            try {
                // 1- Write some elements that fit completely in the first block
                while (pos + 2 * matchingElement.length() < blockSize) {
                    ps.print(matchingElement);
                    pos += matchingElement.length();
                    matchingCount++;
                }
                // 2- Write a long element that spans multiple lines and multiple blocks
                String longElement = matchingElement.replace("data",
                        "data\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\ndata\n");
                ps.print(longElement);
                pos += longElement.length();
                matchingCount++;
                // 3- Write some more elements to fill in the second block completely
                while (pos < 2 * blockSize) {
                    ps.print(matchingElement);
                    pos += matchingElement.length();
                    matchingCount++;
                }
            } finally {
                // Close even if a write fails so the underlying stream is not leaked.
                ps.close();
            }

            PigServer pig = new PigServer(LOCAL, conf);
            Iterator<Tuple> it = loadTag(pig, escapeBackslashes(tempFile.getAbsolutePath()), tagName);
            DocumentBuilder docBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
            int count = 0;
            while (it.hasNext()) {
                Tuple tuple = it.next();
                if (tuple == null) {
                    break;
                }
                if (tuple.size() > 0) {
                    count++;
                    // Make sure the returned text is a proper XML element
                    Document doc = parseXml(docBuilder, (String) tuple.get(0));
                    assertEquals(tagName, doc.getDocumentElement().getNodeName());
                }
            }
            assertEquals(matchingCount, count);
        } finally {
            // Remove the input through the same FileSystem that wrote it.
            localFs.delete(tempPath, false);
        }
    }
}