/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.morphlines.cell;
import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.lucene.util.Constants;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.handler.extraction.ExtractionDateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;
public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {
private Map<String,Integer> expectedRecords = new HashMap<>();
private Map<String, Map<String, Object>> expectedRecordContents = new HashMap<>();
@BeforeClass
public static void beforeClass2() {
assumeFalse("FIXME: Morphlines currently has issues with Windows paths", Constants.WINDOWS);
assumeFalse("This test fails with Java 9 (https://issues.apache.org/jira/browse/SOLR-8876)",
Constants.JRE_IS_MINIMUM_JAVA9);
}
@Before
public void setUp() throws Exception {
super.setUp();
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
expectedRecords.put(path + "sample-statuses-20120906-141433.avro", 2);
expectedRecords.put(path + "sample-statuses-20120906-141433", 2);
expectedRecords.put(path + "sample-statuses-20120906-141433.gz", 2);
expectedRecords.put(path + "sample-statuses-20120906-141433.bz2", 2);
expectedRecords.put(path + "cars.csv", 6);
expectedRecords.put(path + "cars.csv.gz", 6);
expectedRecords.put(path + "cars.tar.gz", 4);
expectedRecords.put(path + "cars.tsv", 6);
expectedRecords.put(path + "cars.ssv", 6);
expectedRecords.put(path + "test-documents.7z", 9);
expectedRecords.put(path + "test-documents.cpio", 9);
expectedRecords.put(path + "test-documents.tar", 9);
expectedRecords.put(path + "test-documents.tbz2", 9);
expectedRecords.put(path + "test-documents.tgz", 9);
expectedRecords.put(path + "test-documents.zip", 9);
expectedRecords.put(path + "multiline-stacktrace.log", 4);
{
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "image/jpeg");
record.put("ignored_exif_isospeedratings", "400");
record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
record.put("ignored_tiff_model", "Canon EOS 40D");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
}
{
String file = path + "testWORD_various.doc";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/msword");
record.put("ignored_author", "Michael McCandless");
record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
record.put("ignored_title", "");
record.put("ignored_keywords", "Keyword1 Keyword2");
record.put("ignored_subject", "Subject is here");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "testPDF.pdf";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/pdf");
record.put("ignored_author", "Bertrand Delacrétaz");
record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
record.put("ignored_title", "Apache Tika - Apache Tika");
record.put("ignored_xmp_creatortool", "Firefox");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "email.eml";
Map<String, Object> record = new LinkedHashMap();
String name = "Patrick Foo <foo@cloudera.com>";
record.put("ignored__attachment_mimetype", "message/rfc822");
record.put("ignored_author", name);
//record.put("ignored_content_length", "1068");
record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
record.put("ignored_message_from", name);
record.put("ignored_message_to", name);
record.put("ignored_creator", name);
record.put("ignored_dc_creator", name);
record.put("ignored_dc_title", "Test EML");
record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
record.put("ignored_meta_author", name);
record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
record.put("ignored_subject", "Test EML");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
{
String file = path + "testEXCEL.xlsx";
Map<String, Object> record = new LinkedHashMap();
record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
record.put("ignored_author", "Keith Bennett");
record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
record.put("ignored_title", "Simple Excel document");
record.put("text", NON_EMPTY_FIELD);
expectedRecordContents.put(file, record);
}
FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
}
@Test
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-6489")
public void testSolrCellJPGCompressed() throws Exception {
morphline = createMorphline("test-morphlines" + File.separator + "solrCellJPGCompressed");
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
String[] files = new String[] {
path + "testJPEG_EXIF.jpg",
path + "testJPEG_EXIF.jpg.gz",
path + "testJPEG_EXIF.jpg.tar.gz",
//path + "jpeg2000.jp2",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
public void testSolrCellXML() throws Exception {
morphline = createMorphline("test-morphlines" + File.separator + "solrCellXML");
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
String[] files = new String[] {
path + "testXML2.xml",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-6489")
public void testSolrCellDocumentTypes() throws Exception {
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
String[] files = new String[] {
path + "testBMPfp.txt",
path + "boilerplate.html",
path + "NullHeader.docx",
path + "testWORD_various.doc",
path + "testPDF.pdf",
path + "testJPEG_EXIF.jpg",
path + "testJPEG_EXIF.jpg.gz",
path + "testJPEG_EXIF.jpg.tar.gz",
path + "testXML.xml",
path + "cars.csv",
// path + "cars.tsv",
// path + "cars.ssv",
path + "cars.csv.gz",
path + "cars.tar.gz",
path + "sample-statuses-20120906-141433.avro",
path + "sample-statuses-20120906-141433",
path + "sample-statuses-20120906-141433.gz",
path + "sample-statuses-20120906-141433.bz2",
path + "email.eml",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
@Test
@AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-9220")
public void testSolrCellDocumentTypes2() throws Exception {
AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");
String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
String[] files = new String[] {
path + "testPPT_various.ppt",
path + "testPPT_various.pptx",
path + "testEXCEL.xlsx",
path + "testEXCEL.xls",
path + "testPages.pages",
//path + "testNumbers.numbers",
//path + "testKeynote.key",
path + "testRTFVarious.rtf",
path + "complex.mbox",
path + "test-outlook.msg",
path + "testEMLX.emlx",
path + "testRFC822",
path + "rsstest.rss",
// path + "testDITA.dita",
path + "testMP3i18n.mp3",
path + "testAIFF.aif",
path + "testFLAC.flac",
// path + "testFLAC.oga",
// path + "testVORBIS.ogg",
path + "testMP4.m4a",
path + "testWAV.wav",
// path + "testWMA.wma",
path + "testFLV.flv",
// path + "testWMV.wmv",
path + "testBMP.bmp",
path + "testPNG.png",
path + "testPSD.psd",
path + "testSVG.svg",
path + "testTIFF.tif",
// path + "test-documents.7z",
// path + "test-documents.cpio",
// path + "test-documents.tar",
// path + "test-documents.tbz2",
// path + "test-documents.tgz",
// path + "test-documents.zip",
// path + "test-zip-of-zip.zip",
// path + "testJAR.jar",
// path + "testKML.kml",
// path + "testRDF.rdf",
path + "testVISIO.vsd",
// path + "testWAR.war",
// path + "testWindows-x86-32.exe",
// path + "testWINMAIL.dat",
// path + "testWMF.wmf",
};
testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
}
/**
* Test that the ContentHandler properly strips the illegal characters
*/
@Test
public void testTransformValue() {
String fieldName = "user_name";
assertFalse("foobar".equals(getFoobarWithNonChars()));
Metadata metadata = new Metadata();
// load illegal char string into a metadata field and generate a new document,
// which will cause the ContentHandler to be invoked.
metadata.set(fieldName, getFoobarWithNonChars());
StripNonCharSolrContentHandlerFactory contentHandlerFactory =
new StripNonCharSolrContentHandlerFactory(ExtractionDateUtil.DEFAULT_DATE_FORMATS);
IndexSchema schema = h.getCore().getLatestSchema();
SolrContentHandler contentHandler =
contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
SolrInputDocument doc = contentHandler.newDocument();
String foobar = doc.getFieldValue(fieldName).toString();
assertTrue("foobar".equals(foobar));
}
/**
* Returns string "foobar" with illegal characters interspersed.
*/
private String getFoobarWithNonChars() {
char illegalChar = '\uffff';
StringBuilder builder = new StringBuilder();
builder.append(illegalChar).append(illegalChar).append("foo").append(illegalChar)
.append(illegalChar).append("bar").append(illegalChar).append(illegalChar);
return builder.toString();
}
}