/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.streams.hdfs;

import org.apache.streams.config.ComponentConfigurator;
import org.apache.streams.config.StreamsConfiguration;
import org.apache.streams.config.StreamsConfigurator;
import org.apache.streams.converter.LineReadWriteUtil;
import org.apache.streams.core.DatumStatusCountable;
import org.apache.streams.core.DatumStatusCounter;
import org.apache.streams.core.StreamsDatum;
import org.apache.streams.core.StreamsPersistReader;
import org.apache.streams.core.StreamsResultSet;
import org.apache.streams.jackson.StreamsJacksonMapper;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Queues;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.DistributedFileSystem;
import org.apache.hadoop.security.UserGroupInformation;
import org.joda.time.DateTime;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.math.BigInteger;
import java.net.URI;
import java.net.URISyntaxException;
import java.security.PrivilegedExceptionAction;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Queue;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.LinkedBlockingQueue;

/**
 * WebHdfsPersistReader reads from HDFS.
 */
public class WebHdfsPersistReader implements StreamsPersistReader, DatumStatusCountable {

  public static final String STREAMS_ID = "WebHdfsPersistReader";

  private static final Logger LOGGER = LoggerFactory.getLogger(WebHdfsPersistReader.class);

  protected static final char DELIMITER = '\t';

  protected FileSystem client;
  protected Path path;
  protected FileStatus[] status;

  protected volatile Queue<StreamsDatum> persistQueue;

  protected ObjectMapper mapper;
  protected LineReadWriteUtil lineReaderUtil;

  protected HdfsReaderConfiguration hdfsConfiguration;
  protected StreamsConfiguration streamsConfiguration;

  private ExecutorService executor;

  protected DatumStatusCounter countersTotal = new DatumStatusCounter();
  protected DatumStatusCounter countersCurrent = new DatumStatusCounter();

  private Future<?> task;

  /**
   * WebHdfsPersistReader constructor - resolves HdfsReaderConfiguration from the 'hdfs' block of the JVM-wide configuration.
   */
  public WebHdfsPersistReader() {
    this(new ComponentConfigurator<>(HdfsReaderConfiguration.class)
        .detectConfiguration(StreamsConfigurator.getConfig().getConfig("hdfs")));
  }

  /**
   * WebHdfsPersistReader constructor - uses supplied HdfsReaderConfiguration.
   * @param hdfsConfiguration hdfsConfiguration
   */
  public WebHdfsPersistReader(HdfsReaderConfiguration hdfsConfiguration) {
    this.hdfsConfiguration = hdfsConfiguration;
  }

  /**
   * getURI from hdfsConfiguration.
   * @return URI
   * @throws URISyntaxException URISyntaxException
   */
  public URI getURI() throws URISyntaxException {
    StringBuilder uriBuilder = new StringBuilder();
    uriBuilder.append(hdfsConfiguration.getScheme());
    uriBuilder.append("://");
    if (StringUtils.isNotBlank(hdfsConfiguration.getHost())) {
      uriBuilder.append(hdfsConfiguration.getHost());
      if (hdfsConfiguration.getPort() != null) {
        uriBuilder.append(":" + hdfsConfiguration.getPort());
      }
    } else {
      uriBuilder.append("/");
    }
    return new URI(uriBuilder.toString());
  }

  /**
   * isConnected.
   * @return true if connected, false otherwise
   */
  public boolean isConnected() {
    return (client != null);
  }

  /**
   * getFileSystem.
   * @return FileSystem
   */
  public final synchronized FileSystem getFileSystem() {
    // Check to see if we are connected.
    if (!isConnected()) {
      connectToWebHDFS();
    }
    return this.client;
  }

  // TODO: combine with WebHdfsPersistWriter.connectToWebHDFS
  private synchronized void connectToWebHDFS() {
    try {
      LOGGER.info("User : {}", this.hdfsConfiguration.getUser());
      UserGroupInformation ugi = UserGroupInformation.createRemoteUser(this.hdfsConfiguration.getUser());
      ugi.setAuthenticationMethod(UserGroupInformation.AuthenticationMethod.SIMPLE);

      ugi.doAs((PrivilegedExceptionAction<Void>) () -> {
        Configuration conf = new Configuration();
        conf.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
        conf.set("fs.hdfs.impl", DistributedFileSystem.class.getName());
        conf.set("fs.file.impl", LocalFileSystem.class.getName());
        LOGGER.info("WebURI : {}", getURI().toString());
        client = FileSystem.get(getURI(), conf);
        LOGGER.info("Connected to WebHDFS");

        /*
         * ************************************************************************************************
         * This code is an example of how you would work with HDFS if you weren't going over
         * the webHDFS protocol.
         *
         * Smashew: 2013-10-01
         * ************************************************************************************************
            conf.set("fs.defaultFS", "hdfs://hadoop.mdigitallife.com:8020/user/" + userName);
            conf.set("namenode.host", "0.0.0.0");
            conf.set("hadoop.job.ugi", userName);
            conf.set(DFSConfigKeys.DFS_NAMENODE_USER_NAME_KEY, "runner");
            fs.createNewFile(new Path("/user/" + userName + "/test"));
            FileStatus[] status = fs.listStatus(new Path("/user/" + userName));
            for (int i = 0; i < status.length; i++) {
              LOGGER.info("Directory: {}", status[i].getPath());
            }
         */
        return null;
      });
    } catch (Exception ex) {
      LOGGER.error("There was an error connecting to WebHDFS, please check your settings and try again", ex);
    }
  }

  @Override
  public String getId() {
    return STREAMS_ID;
  }

  @Override
  public void prepare(Object configurationObject) {
    LOGGER.debug("Prepare");
    lineReaderUtil = LineReadWriteUtil.getInstance(hdfsConfiguration);
    connectToWebHDFS();
    String pathString = hdfsConfiguration.getPath() + "/" + hdfsConfiguration.getReaderPath();
    LOGGER.info("Path : {}", pathString);
    path = new Path(pathString);
    try {
      if (client.isFile(path)) {
        LOGGER.info("Found File");
        FileStatus fileStatus = client.getFileStatus(path);
        status = new FileStatus[1];
        status[0] = fileStatus;
      } else if (client.isDirectory(path)) {
        status = client.listStatus(path);
        List<FileStatus> statusList = Arrays.asList(status);
        Collections.sort(statusList);
        status = statusList.toArray(new FileStatus[0]);
        LOGGER.info("Found Directory : {} files", status.length);
      } else {
        LOGGER.error("Path is neither a file nor a directory: {}", pathString);
      }
    } catch (IOException ex) {
      LOGGER.error("IOException", ex);
    }
    streamsConfiguration = StreamsConfigurator.detectConfiguration();
    persistQueue = Queues.synchronizedQueue(new LinkedBlockingQueue<StreamsDatum>(streamsConfiguration.getBatchSize().intValue()));
    executor = Executors.newSingleThreadExecutor();
    mapper = StreamsJacksonMapper.getInstance();
  }

  @Override
  public void cleanUp() {

  }

  @Override
  public StreamsResultSet readAll() {
    WebHdfsPersistReaderTask readerTask = new WebHdfsPersistReaderTask(this);
    Thread readerThread = new Thread(readerTask);
    readerThread.start();
    try {
      readerThread.join();
    } catch (InterruptedException ignored) {
      LOGGER.trace("ignored InterruptedException", ignored);
    }
    return new StreamsResultSet(persistQueue);
  }

  @Override
  public void startStream() {
    LOGGER.debug("startStream");
    task = executor.submit(new WebHdfsPersistReaderTask(this));
  }

  @Override
  public StreamsResultSet readCurrent() {
    StreamsResultSet current;
    synchronized (WebHdfsPersistReader.class) {
      current = new StreamsResultSet(new ConcurrentLinkedQueue<>(persistQueue));
      current.setCounter(new DatumStatusCounter());
      current.getCounter().add(countersCurrent);
      countersTotal.add(countersCurrent);
      countersCurrent = new DatumStatusCounter();
      persistQueue.clear();
    }
    return current;
  }

  protected void write(StreamsDatum entry) {
    boolean success;
    do {
      synchronized (WebHdfsPersistReader.class) {
        success = persistQueue.offer(entry);
      }
      Thread.yield();
    } while (!success);
  }

  @Override
  public StreamsResultSet readNew(BigInteger sequence) {
    return null;
  }

  @Override
  public StreamsResultSet readRange(DateTime start, DateTime end) {
    return null;
  }

  @Override
  public boolean isRunning() {
    return task == null || !task.isDone() && !task.isCancelled();
  }

  @Override
  public DatumStatusCounter getDatumStatusCounter() {
    return countersTotal;
  }
}
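
/*
 * A minimal usage sketch (illustrative only, not part of the original source). It assumes an
 * "hdfs" block is present in the application's typesafe configuration so the no-arg constructor
 * can resolve an HdfsReaderConfiguration, and that StreamsResultSet from streams-core is
 * iterable over StreamsDatum:
 *
 *   WebHdfsPersistReader reader = new WebHdfsPersistReader();
 *   reader.prepare(null);                        // connects to WebHDFS and lists the reader path
 *   StreamsResultSet result = reader.readAll();  // runs a WebHdfsPersistReaderTask to completion
 *   for (StreamsDatum datum : result) {
 *     // process datum.getDocument()
 *   }
 *   reader.cleanUp();
 */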