/*
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the
* NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF
* licenses this file to you under the Apache License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under the License is
* distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and limitations under the License.
*/
package org.apache.pig.piggybank.test.storage;
import static org.apache.pig.ExecType.LOCAL;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import junit.framework.TestCase;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;
import org.apache.pig.data.Tuple;
import org.apache.pig.test.Util;
public class TestXMLLoader extends TestCase {
private static String patternString = "(\\d+)!+(\\w+)~+(\\w+)";
public static ArrayList<String[]> data = new ArrayList<String[]>();
static {
data.add(new String[] { "<configuration>"});
data.add(new String[] { "<property>"});
data.add(new String[] { "<name> foobar </name>"});
data.add(new String[] { "<value> barfoo </value>"});
data.add(new String[] { "</property>"});
data.add(new String[] { "<ignoreProperty>"});
data.add(new String[] { "<name> foo </name>"});
data.add(new String[] { "</ignoreProperty>"});
data.add(new String[] { "<property>"});
data.add(new String[] { "<name> justname </name>"});
data.add(new String[] { "</property>"});
data.add(new String[] { "</configuration>"});
}
public static ArrayList<String[]> nestedTags = new ArrayList<String[]>();
static {
nestedTags.add(new String[] { "<events>"});
nestedTags.add(new String[] { "<event id='116913365'>"});
nestedTags.add(new String[] { "<eventRank>1.000000000000</eventRank>"});
nestedTags.add(new String[] { "<name>XY</name>"});
nestedTags.add(new String[] { "<relatedEvents>"});
nestedTags.add(new String[] { "<event id='116913365'>x</event>"});
nestedTags.add(new String[] { "<event id='116913365'>y</event>"});
nestedTags.add(new String[] { "</relatedEvents>"});
nestedTags.add(new String[] { "</event>"});
nestedTags.add(new String[] { "<event id='116913365'>"});
nestedTags.add(new String[] { "<eventRank>3.0000</eventRank>"});
nestedTags.add(new String[] { "<name>AB</name>"});
nestedTags.add(new String[] { "<relatedEvents>"});
nestedTags.add(new String[] { "<event id='116913365'>a</event>"});
nestedTags.add(new String[] { "<event id='116913365'>b</event>"});
nestedTags.add(new String[] { "</relatedEvents>"});
nestedTags.add(new String[] { "</event>"});
nestedTags.add(new String[] { "<event>"});
nestedTags.add(new String[] { "<eventRank>4.0000</eventRank>"});
nestedTags.add(new String[] { "<name>CD</name>"});
nestedTags.add(new String[] { "<relatedEvents>"});
nestedTags.add(new String[] { "<event>c</event>"});
nestedTags.add(new String[] { "<event>d</event>"});
nestedTags.add(new String[] { "</relatedEvents>"});
nestedTags.add(new String[] { "</event>"});
nestedTags.add(new String[] { "</events>"});
}
public static ArrayList<String[]> inlineClosedTags = new ArrayList<String[]>();
static {
inlineClosedTags.add(new String[] { "<events>"});
inlineClosedTags.add(new String[] { "<event id='3423'/>"});
inlineClosedTags.add(new String[] { "<event/>"});
inlineClosedTags.add(new String[] { "<event><event/></event>"});
inlineClosedTags.add(new String[] { "<event id='33'><tag k='a' v='b'/></event>"});
inlineClosedTags.add(new String[] { "</events>"});
}
public void testShouldReturn0TupleCountIfSearchTagIsNotFound () throws Exception
{
String filename = TestHelper.createTempFile(data, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('invalid') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(0, tupleCount);
}
public void testLoadXMLLoader() throws Exception {
//ArrayList<DataByteArray[]> expected = TestHelper.getExpected(data, pattern);
String filename = TestHelper.createTempFile(data, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('property') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(2, tupleCount);
}
public void testXMLLoaderShouldLoadBasicBzip2Files() throws Exception {
String filename = TestHelper.createTempFile(data, "");
Process bzipProc = Runtime.getRuntime().exec("bzip2 "+filename);
int waitFor = bzipProc.waitFor();
if(waitFor != 0)
{
fail ("Failed to create the class");
}
filename = filename + ".bz2";
try
{
PigServer pigServer = new PigServer (ExecType.LOCAL);
String loadQuery = "A = LOAD '" + Util.encodeEscape(filename) + "' USING org.apache.pig.piggybank.storage.XMLLoader('property') as (doc:chararray);";
pigServer.registerQuery(loadQuery);
Iterator<Tuple> it = pigServer.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
//TestHelper.examineTuple(expected, tuple, tupleCount);
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(2, tupleCount);
}finally
{
new File(filename).delete();
}
}
public void testLoaderShouldLoadBasicGzFile() throws Exception {
String filename = TestHelper.createTempFile(data, "");
Process bzipProc = Runtime.getRuntime().exec("gzip "+filename);
int waitFor = bzipProc.waitFor();
if(waitFor != 0)
{
fail ("Failed to create the class");
}
filename = filename + ".gz";
try
{
PigServer pigServer = new PigServer (ExecType.LOCAL);
String loadQuery = "A = LOAD '" + Util.encodeEscape(filename) + "' USING org.apache.pig.piggybank.storage.XMLLoader('property') as (doc:chararray);";
pigServer.registerQuery(loadQuery);
Iterator<Tuple> it = pigServer.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(2, tupleCount);
}finally
{
new File(filename).delete();
}
}
public void testXMLLoaderShouldNotConfusedWithTagsHavingSimilarPrefix () throws Exception
{
ArrayList<String[]> testData = new ArrayList<String[]>();
testData.add(new String[] { "<namethisalso> foobar9 </namethisalso>"});
testData.addAll(data);
String filename = TestHelper.createTempFile(testData, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('name') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(3, tupleCount);
}
public void testShouldReturn1ForIntermediateTagData () throws Exception
{
String filename = TestHelper.createTempFile(data, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('ignoreProperty') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(1, tupleCount);
}
public void testShouldReturn0TupleCountIfNoEndTagIsFound() throws Exception
{
// modify the data content to avoid end tag for </ignoreProperty>
ArrayList<String[]> testData = new ArrayList<String[]>();
for (String content[] : data) {
if(false == data.equals(new String[] { "</ignoreProperty>"}))
{
testData.add(content);
}
}
String filename = TestHelper.createTempFile(testData, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('</ignoreProperty>') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(0, tupleCount);
}
public void testShouldReturn0TupleCountIfEmptyFileIsPassed() throws Exception
{
// modify the data content to avoid end tag for </ignoreProperty>
ArrayList<String[]> testData = new ArrayList<String[]>();
String filename = TestHelper.createTempFile(testData, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('</ignoreProperty>') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(0, tupleCount);
}
public void testXMLLoaderShouldSupportNestedTagWithSameName() throws Exception {
String filename = TestHelper.createTempFile(nestedTags, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(3, tupleCount);
}
public void testXMLLoaderShouldWorkWithInlineClosedTags() throws Exception {
String filename = TestHelper.createTempFile(inlineClosedTags, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
int tupleCount = 0;
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
if (tuple.size() > 0) {
tupleCount++;
}
}
}
assertEquals(4, tupleCount);
}
public void testXMLLoaderShouldReturnValidXML() throws Exception {
String filename = TestHelper.createTempFile(inlineClosedTags, "");
PigServer pig = new PigServer(LOCAL);
filename = filename.replace("\\", "\\\\");
patternString = patternString.replace("\\", "\\\\");
String query = "A = LOAD '" + filename + "' USING org.apache.pig.piggybank.storage.XMLLoader('event') as (doc:chararray);";
pig.registerQuery(query);
Iterator<?> it = pig.openIterator("A");
while (it.hasNext()) {
Tuple tuple = (Tuple) it.next();
if (tuple == null)
break;
else {
// Test it returns a valid XML
DocumentBuilder docBuilder =
DocumentBuilderFactory.newInstance().newDocumentBuilder();
docBuilder.parse(new ByteArrayInputStream(((String)tuple.get(0)).getBytes()));
}
}
}
}