/* * Licensed to CRATE Technology GmbH ("Crate") under one or more contributor * license agreements. See the NOTICE file distributed with this work for * additional information regarding copyright ownership. Crate licenses * this file to you under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. You may * obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the * License for the specific language governing permissions and limitations * under the License. * * However, if you have executed another commercial license agreement * with Crate these terms will supersede the license and you may use the * software solely pursuant to the terms of the relevant commercial agreement. */ package io.crate.operation.collect.files; import com.amazonaws.services.s3.AmazonS3; import com.amazonaws.services.s3.AmazonS3Client; import com.amazonaws.services.s3.model.ObjectListing; import com.amazonaws.services.s3.model.S3Object; import com.amazonaws.services.s3.model.S3ObjectInputStream; import com.amazonaws.services.s3.model.S3ObjectSummary; import com.google.common.collect.ImmutableMap; import io.crate.data.*; import io.crate.external.S3ClientHelper; import io.crate.metadata.*; import io.crate.operation.InputFactory; import io.crate.operation.collect.BatchIteratorCollectorBridge; import io.crate.operation.reference.file.FileLineReferenceResolver; import io.crate.test.integration.CrateUnitTest; import io.crate.testing.TestingBatchConsumer; import io.crate.testing.TestingHelpers; import io.crate.types.DataTypes; import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; import org.mockito.invocation.InvocationOnMock; import org.mockito.stubbing.Answer; import java.io.*; import java.net.MalformedURLException; import java.net.SocketTimeoutException; import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; import java.util.zip.GZIPOutputStream; import static io.crate.testing.TestingHelpers.createReference; import static io.crate.testing.TestingHelpers.isRow; import static org.hamcrest.Matchers.is; import static org.mockito.Matchers.*; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; public class FileReadingCollectorTest extends CrateUnitTest { private static File tmpFile; private static File tmpFileGz; private static File tmpFileEmptyLine; private InputFactory inputFactory; @BeforeClass public static void setUpClass() throws Exception { Path copy_from = Files.createTempDirectory("copy_from"); Path copy_from_gz = Files.createTempDirectory("copy_from_gz"); Path copy_from_empty = Files.createTempDirectory("copy_from_empty"); tmpFileGz = File.createTempFile("fileReadingCollector", ".json.gz", copy_from_gz.toFile()); tmpFile = File.createTempFile("fileReadingCollector", ".json", copy_from.toFile()); tmpFileEmptyLine = File.createTempFile("emptyLine", ".json", copy_from_empty.toFile()); try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new GZIPOutputStream(new FileOutputStream(tmpFileGz)), StandardCharsets.UTF_8))) { writer.write("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}\n"); writer.write("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}\n"); } try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tmpFile), StandardCharsets.UTF_8)) { writer.write("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}\n"); writer.write("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}\n"); } try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tmpFileEmptyLine), StandardCharsets.UTF_8)) { writer.write("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}\n"); writer.write("\n"); writer.write("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}\n"); } } @Before public void prepare() throws Exception { Functions functions = new Functions( ImmutableMap.<FunctionIdent, FunctionImplementation>of(), ImmutableMap.<String, FunctionResolver>of() ); inputFactory = new InputFactory(functions); } @AfterClass public static void tearDownClass() throws Exception { assertThat(tmpFile.delete(), is(true)); assertThat(tmpFileGz.delete(), is(true)); assertThat(tmpFileEmptyLine.delete(), is(true)); } @Test public void testUmlautsAndWhitespacesWithExplicitURIThrowsAre() throws Throwable { expectedException.expect(IllegalArgumentException.class); expectedException.expectMessage("Illegal character in path at index 12: file:///this will fäil.json"); getObjects("file:///this will fäil.json"); } @Test public void testCollectFromS3Uri() throws Throwable { // this test just verifies the s3 schema detection and bucketName / prefix extraction from the uri. // real s3 interaction is mocked completely. TestingBatchConsumer projector = getObjects("s3://fakebucket/foo"); projector.getResult(); } @Test public void testNoErrorIfNoSuchFile() throws Throwable { // no error, -> don't want to fail just because one node doesn't have a file getObjects("file:///some/path/that/shouldnt/exist/foo.json"); getObjects("file:///some/path/that/shouldnt/exist/*"); } @Test(expected = IllegalArgumentException.class) public void testRelativeImport() throws Throwable { TestingBatchConsumer projector = getObjects("xy"); assertCorrectResult(projector.getBucket()); } @Test public void testCollectFromUriWithGlob() throws Throwable { TestingBatchConsumer projector = getObjects( Paths.get(tmpFile.getParentFile().toURI()).toUri().toString() + "file*.json"); assertCorrectResult(projector.getBucket()); } @Test public void testCollectFromDirectory() throws Throwable { TestingBatchConsumer projector = getObjects( Paths.get(tmpFile.getParentFile().toURI()).toUri().toString() + "*"); assertCorrectResult(projector.getBucket()); } @Test public void testDoCollectRaw() throws Throwable { TestingBatchConsumer consumer = getObjects(Paths.get(tmpFile.toURI()).toUri().toString()); assertCorrectResult(consumer.getBucket()); } @Test public void testDoCollectRawFromCompressed() throws Throwable { TestingBatchConsumer consumer = getObjects(Collections.singletonList(Paths.get(tmpFileGz.toURI()).toUri().toString()), "gzip"); assertCorrectResult(consumer.getBucket()); } @Test public void testCollectWithEmptyLine() throws Throwable { TestingBatchConsumer consumer = getObjects(Paths.get(tmpFileEmptyLine.toURI()).toUri().toString()); assertCorrectResult(consumer.getBucket()); } @Test public void testCollectWithOneSocketTimeout() throws Throwable { S3ObjectInputStream inputStream = mock(S3ObjectInputStream.class); when(inputStream.read(new byte[anyInt()], anyInt(), anyByte())) .thenAnswer(new WriteBufferAnswer(new byte[]{102, 111, 111, 10})) // first line: foo .thenThrow(new SocketTimeoutException()) // exception causes retry .thenAnswer(new WriteBufferAnswer(new byte[]{102, 111, 111, 10})) // first line again, because of retry .thenAnswer(new WriteBufferAnswer(new byte[]{98, 97, 114, 10})) // second line: bar .thenReturn(-1); TestingBatchConsumer consumer = getObjects(Collections.singletonList("s3://fakebucket/foo"), null, inputStream); Bucket rows = consumer.getBucket(); assertThat(rows.size(), is(2)); assertThat(TestingHelpers.printedTable(rows), is("foo\nbar\n")); } @Test public void unsupportedURITest() throws Throwable { expectedException.expect(MalformedURLException.class); expectedException.expectMessage("unknown protocol: invalid"); getObjects("invalid://crate.io/docs/en/latest/sql/reference/copy_from.html").getBucket(); } @Test public void testMultipleUriSupport() throws Throwable { List<String> fileUris = new ArrayList<>(); fileUris.add(Paths.get(tmpFile.toURI()).toUri().toString()); fileUris.add(Paths.get(tmpFileEmptyLine.toURI()).toUri().toString()); TestingBatchConsumer consumer = getObjects(fileUris, null); Iterator<Row> it = consumer.getBucket().iterator(); assertThat(it.next(), isRow("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}")); assertThat(it.next(), isRow("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}")); assertThat(it.next(), isRow("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}")); assertThat(it.next(), isRow("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}")); } private void assertCorrectResult(Bucket rows) throws Throwable { Iterator<Row> it = rows.iterator(); assertThat(it.next(), isRow("{\"name\": \"Arthur\", \"id\": 4, \"details\": {\"age\": 38}}")); assertThat(it.next(), isRow("{\"id\": 5, \"name\": \"Trillian\", \"details\": {\"age\": 33}}")); } private TestingBatchConsumer getObjects(String fileUri) throws Throwable { return getObjects(Collections.singletonList(fileUri), null); } private TestingBatchConsumer getObjects(Collection<String> fileUris, String compression) throws Throwable { S3ObjectInputStream inputStream = mock(S3ObjectInputStream.class); when(inputStream.read(new byte[anyInt()], anyInt(), anyByte())).thenReturn(-1); return getObjects(fileUris, compression, inputStream); } private TestingBatchConsumer getObjects(Collection<String> fileUris, String compression, S3ObjectInputStream s3InputStream) throws Throwable { TestingBatchConsumer consumer = new TestingBatchConsumer(); getObjects(fileUris, compression, s3InputStream, consumer); return consumer; } private void getObjects(Collection<String> fileUris, String compression, final S3ObjectInputStream s3InputStream, BatchConsumer consumer) throws Throwable { BatchIterator iterator = createBatchIterator(fileUris, compression, s3InputStream); BatchIteratorCollectorBridge.newInstance(iterator, consumer).doCollect(); } private BatchIterator createBatchIterator(Collection<String> fileUris, String compression, final S3ObjectInputStream s3InputStream) { Reference raw = createReference("_raw", DataTypes.STRING); InputFactory.Context<LineCollectorExpression<?>> ctx = inputFactory.ctxForRefs(FileLineReferenceResolver::getImplementation); List<Input<?>> inputs = Collections.singletonList(ctx.add(raw)); return FileReadingIterator.newInstance( fileUris, inputs, ctx.expressions(), compression, ImmutableMap.of( LocalFsFileInputFactory.NAME, new LocalFsFileInputFactory(), S3FileInputFactory.NAME, () -> new S3FileInput(new S3ClientHelper() { @Override protected AmazonS3 initClient(String accessKey, String secretKey) throws IOException { AmazonS3 client = mock(AmazonS3Client.class); ObjectListing objectListing = mock(ObjectListing.class); S3ObjectSummary summary = mock(S3ObjectSummary.class); S3Object s3Object = mock(S3Object.class); when(client.listObjects(anyString(), anyString())).thenReturn(objectListing); when(objectListing.getObjectSummaries()).thenReturn(Arrays.asList(summary)); when(summary.getKey()).thenReturn("foo"); when(client.getObject("fakebucket", "foo")).thenReturn(s3Object); when(s3Object.getObjectContent()).thenReturn(s3InputStream); when(client.listNextBatchOfObjects(any(ObjectListing.class))).thenReturn(objectListing); when(objectListing.isTruncated()).thenReturn(false); return client; } })), false, 1, 0 ); } private static class WriteBufferAnswer implements Answer<Integer> { private byte[] bytes; public WriteBufferAnswer(byte[] bytes) { this.bytes = bytes; } @Override public Integer answer(InvocationOnMock invocation) throws Throwable { byte[] buffer = (byte[]) invocation.getArguments()[0]; System.arraycopy(bytes, 0, buffer, 0, bytes.length); return bytes.length; } } }