/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.pig.piggybank.evaluation.string; import java.io.BufferedReader; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.HashMap; import java.util.Map; import java.util.Properties; import org.apache.pig.EvalFunc; import org.apache.pig.backend.hadoop.datastorage.ConfigurationUtil; import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigMapReduce; import org.apache.pig.data.DataType; import org.apache.pig.data.Tuple; import org.apache.pig.impl.io.FileLocalizer; import org.apache.pig.impl.logicalLayer.schema.Schema; /** * <dl> * <dt><b>Syntax:</b></dt> * <dd><code>int lookupInFiles(String expression,... <comma separated filelist>)</code>.</dd> * <dt><b>Input:</b></dt> * <dd><code>files are text files on DFS</code>.</dd> * <dt><b>Output:</b></dt> * <dd><code>if any file contains expression, return 1, otherwise, 0</code>.</dd> * </dl> */ public class LookupInFiles extends EvalFunc<Integer> { boolean initialized = false; ArrayList<String> mFiles = new ArrayList<String>(); Map<String, Boolean> mKeys = new HashMap<String, Boolean>(); static Map<ArrayList<String>, Map<String, Boolean>> mTables = new HashMap<ArrayList<String>, Map<String, Boolean>>(); @Override public Schema outputSchema(Schema input) { try { return new Schema(new Schema.FieldSchema(getSchemaName(this.getClass().getName().toLowerCase(), input), DataType.INTEGER)); } catch (Exception e) { return null; } } public void init(Tuple tuple) throws IOException { for (int count = 1; count < tuple.size(); count++) { if (!(tuple.get(count) instanceof String)) { String msg = "LookupInFiles : Filename should be a string."; throw new IOException(msg); } mFiles.add((String) tuple.get(count)); } if (mTables.containsKey(mFiles)) { mKeys = mTables.get(mFiles); } else { Properties props = ConfigurationUtil.toProperties(PigMapReduce.sJobConfInternal.get()); for (int i = 0; i < mFiles.size(); ++i) { // Files contain only 1 column with the key. No Schema. All keys // separated by new line. BufferedReader reader = null; InputStream is = null; try { is = FileLocalizer.openDFSFile(mFiles.get(i), props); } catch (IOException e) { String msg = "LookupInFiles : Cannot open file "+mFiles.get(i); throw new IOException(msg, e); } try { reader = new BufferedReader(new InputStreamReader(is)); String line; while ((line = reader.readLine()) != null) { if (!mKeys.containsKey(line)) mKeys.put(line, true); } is.close(); } catch (IOException e) { String msg = "LookupInFiles : Cannot read file "+mFiles.get(i); throw new IOException(msg, e); } } mTables.put(mFiles, mKeys); } initialized=true; } @Override public Integer exec(Tuple input) throws IOException { if (!initialized) init(input); if (input.get(0)==null) return null; if (mKeys.containsKey(input.get(0).toString())) return 1; return 0; } }