/*
 * This file is part of CoAnSys project.
 * Copyright (c) 2012-2015 ICM-UW
 *
 * CoAnSys is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Affero General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 * CoAnSys is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Affero General Public License for more details.
 *
 * You should have received a copy of the GNU Affero General Public License
 * along with CoAnSys. If not, see <http://www.gnu.org/licenses/>.
 */
package pl.edu.icm.coansys.commons.pig.udf;

import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertTrue;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.hbase.MasterNotRunningException;
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema;
import org.testng.annotations.Test;

import pl.edu.icm.coansys.models.DocumentProtos.BasicMetadata;
import pl.edu.icm.coansys.models.DocumentProtos.DocumentMetadata;
import pl.edu.icm.coansys.models.DocumentProtos.TextWithLanguage;

/**
 * Round-trip test for the {@link DocumentMetadataToTuple} and
 * {@link TupleToProtoBytearray} UDFs: a {@link DocumentMetadata} message is
 * converted to a Pig tuple, the tuple is checked against the schema declared
 * by the UDF, and the tuple is then converted back and compared with the
 * original message.
 *
 * @author Artur Czeczko <a.czeczko@icm.edu.pl>
 */
public class DocumentMetadataToTupleTest {

    /** Test titles keyed by language code; used both to build the input and to verify the output. */
    private static final Map<String, String> titlesMap = new HashMap<String, String>();

    static {
        titlesMap.put("en", "Title of test article");
        titlesMap.put("fr", "Le titre de l'article");
    }

    /**
     * Converts a sample {@link DocumentMetadata} to a Pig tuple, verifies the
     * tuple against the UDF's output schema (including the {@code key} and
     * {@code basicmetadata} fields) and checks that converting the tuple back
     * yields a message equal to the input.
     *
     * @throws MasterNotRunningException declared for compatibility with the original signature
     * @throws ZooKeeperConnectionException declared for compatibility with the original signature
     * @throws IOException if a conversion or protobuf parsing fails
     */
    @Test(groups = {"fast"})
    public void basicTest() throws MasterNotRunningException, ZooKeeperConnectionException, IOException {
        // prepare some test data
        DocumentMetadata inputDM = buildInputMetadata();

        // convert by DocumentMetadataToTuple UDF
        // UDF input argument must be a Tuple, so we have to prepare a tuple with
        // one field of type bytearray
        DataByteArray dbarr = new DataByteArray(inputDM.toByteArray());
        Tuple metadataTuple = TupleFactory.getInstance().newTuple(dbarr);

        // instantiate a UDF and convert
        DocumentMetadataToTuple dm2tpl = new DocumentMetadataToTuple();
        Tuple pigTuple = dm2tpl.exec(metadataTuple);

        // asserts
        assertTrue(pigTuple.size() > 0);

        // the declared output schema is expected to be a single TUPLE field
        // whose inner schema matches the produced tuple field-for-field
        Schema oneFieldSchema = dm2tpl.outputSchema(null);
        assertEquals(oneFieldSchema.size(), 1);
        assertEquals(oneFieldSchema.getField(0).type, DataType.TUPLE);

        Schema tupleSchema = oneFieldSchema.getField(0).schema;
        assertEquals(tupleSchema.size(), pigTuple.size());

        boolean keyFound = false;

        // go through the data
        for (int i = 0; i < pigTuple.size(); i++) {
            FieldSchema fieldSchema = tupleSchema.getField(i);
            if (pigTuple.getType(i) == DataType.NULL) {
                // unset fields carry no value to compare against the schema
                continue;
            }
            assertEquals(pigTuple.getType(i), fieldSchema.type,
                    "Type mismatch for field " + fieldSchema.alias);

            if (fieldSchema.alias.equals("key")) {
                assertEquals(fieldSchema.type, DataType.CHARARRAY);
                assertEquals((String) pigTuple.get(i), inputDM.getKey());
                keyFound = true;
            } else if (fieldSchema.alias.equals("basicmetadata")) {
                assertEquals(fieldSchema.type, DataType.TUPLE);
                Tuple bmTuple = (Tuple) pigTuple.get(i);
                Schema bmSchema = fieldSchema.schema;
                checkBasicMetadata(bmTuple, bmSchema);
            }
        }
        assertTrue(keyFound, "Key not found in tuple");

        // convert to DocumentMetadata by TupleToProtoBytearray UDF
        TupleToProtoBytearray tpl2dm = new TupleToProtoBytearray(DocumentMetadata.class);
        DataByteArray finalDMdba = tpl2dm.exec(pigTuple);
        DocumentMetadata finalDM = DocumentMetadata.parseFrom(finalDMdba.get());

        assertEquals(finalDM, inputDM);
    }

    /**
     * Builds the input message: key {@code "key123"} and one title per entry
     * of {@link #titlesMap}.
     *
     * @return the assembled {@link DocumentMetadata}
     */
    private static DocumentMetadata buildInputMetadata() {
        DocumentMetadata.Builder dmBuilder = DocumentMetadata.newBuilder();
        dmBuilder.setKey("key123");

        BasicMetadata.Builder basic = BasicMetadata.newBuilder();
        for (Map.Entry<String, String> entry : titlesMap.entrySet()) {
            TextWithLanguage.Builder title = TextWithLanguage.newBuilder();
            title.setText(entry.getValue());
            title.setLanguage(entry.getKey());
            basic.addTitle(title);
        }
        dmBuilder.setBasicMetadata(basic);

        return dmBuilder.build();
    }

    /**
     * Verifies the {@code basicmetadata} tuple: its size must match its
     * schema, and the {@code title} bag must contain exactly the
     * language/text pairs from {@link #titlesMap}.
     *
     * @param bmTuple the tuple holding the basic metadata values
     * @param bmSchema the schema describing {@code bmTuple}
     * @throws FrontendException if a schema field cannot be accessed
     * @throws ExecException if a tuple field cannot be read
     */
    private void checkBasicMetadata(Tuple bmTuple, Schema bmSchema) throws FrontendException, ExecException {
        int titlesFound = 0;

        assertTrue(bmSchema.size() > 1);
        assertEquals(bmSchema.size(), bmTuple.size());

        for (int j = 0; j < bmTuple.size(); j++) {
            FieldSchema bmFieldSchema = bmSchema.getField(j);
            if (bmFieldSchema.alias.equals("title")) {
                // A TestNG assertion is used rather than the Java 'assert'
                // keyword, so the check runs even when the JVM is started
                // without -ea.
                assertEquals(bmFieldSchema.type, DataType.BAG);
                DataBag bag = (DataBag) bmTuple.get(j);
                // the bag schema's first field carries the schema of the tuples inside the bag
                Schema titleSchema = bmFieldSchema.schema.getField(0).schema;

                Map<String, String> extractedTitles = new HashMap<String, String>();
                for (Tuple titleTuple : bag) {
                    String txt = "";
                    String lang = "";
                    for (int k = 0; k < titleTuple.size(); k++) {
                        String alias = titleSchema.getField(k).alias;
                        if (alias.equals("text")) {
                            txt = (String) titleTuple.get(k);
                        } else if (alias.equals("language")) {
                            lang = (String) titleTuple.get(k);
                        }
                    }
                    extractedTitles.put(lang, txt);
                    titlesFound++;
                }
                assertEquals(extractedTitles, titlesMap);
            }
            // else - other fields - maybe in the future...
        }
        assertEquals(titlesFound, titlesMap.size());
    }
}