// SolrCellMorphlineTest.java example
//
// Explorer
// lucene-solr-master
// - lucene
// - solr
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.morphlines.cell;

import java.io.File;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.commons.io.FileUtils;
import org.apache.lucene.util.Constants;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.MapSolrParams;
import org.apache.solr.handler.extraction.ExtractionDateUtil;
import org.apache.solr.handler.extraction.SolrContentHandler;
import org.apache.solr.morphlines.solr.AbstractSolrMorphlineTestBase;
import org.apache.solr.schema.IndexSchema;
import org.apache.tika.metadata.Metadata;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.Test;

public class SolrCellMorphlineTest extends AbstractSolrMorphlineTestBase {

  private Map<String,Integer> expectedRecords = new HashMap<>();
  private Map<String, Map<String, Object>> expectedRecordContents = new HashMap<>();
  
  @BeforeClass
  public static void beforeClass2() {
    assumeFalse("FIXME: Morphlines currently has issues with Windows paths", Constants.WINDOWS);
    assumeFalse("This test fails with Java 9 (https://issues.apache.org/jira/browse/SOLR-8876)",
        Constants.JRE_IS_MINIMUM_JAVA9);
  }

  @Before
  public void setUp() throws Exception {
    super.setUp();
    
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    expectedRecords.put(path + "sample-statuses-20120906-141433.avro", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433.gz", 2);
    expectedRecords.put(path + "sample-statuses-20120906-141433.bz2", 2);
    expectedRecords.put(path + "cars.csv", 6);
    expectedRecords.put(path + "cars.csv.gz", 6);
    expectedRecords.put(path + "cars.tar.gz", 4);
    expectedRecords.put(path + "cars.tsv", 6);
    expectedRecords.put(path + "cars.ssv", 6);
    expectedRecords.put(path + "test-documents.7z", 9);
    expectedRecords.put(path + "test-documents.cpio", 9);
    expectedRecords.put(path + "test-documents.tar", 9);
    expectedRecords.put(path + "test-documents.tbz2", 9);
    expectedRecords.put(path + "test-documents.tgz", 9);
    expectedRecords.put(path + "test-documents.zip", 9);
    expectedRecords.put(path + "multiline-stacktrace.log", 4);
    
    {
      Map<String, Object> record = new LinkedHashMap();
      record.put("ignored__attachment_mimetype", "image/jpeg");
      record.put("ignored_exif_isospeedratings", "400");
      record.put("ignored_meta_creation_date", "2009-08-11T09:09:45");
      record.put("ignored_tiff_model", "Canon EOS 40D");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put("/testJPEG_EXIF.jpg", record);
      expectedRecordContents.put("/testJPEG_EXIF.jpg.tar", record);
      expectedRecordContents.put("/testJPEG_EXIF.jpg.tar.gz", record);
    }
    
    {
      String file = path + "testWORD_various.doc";
      Map<String, Object> record = new LinkedHashMap();
      record.put("ignored__attachment_mimetype", "application/msword");
      record.put("ignored_author", "Michael McCandless");
      record.put("ignored_creation_date", "2011-09-02T10:11:00Z");
      record.put("ignored_title", "");
      record.put("ignored_keywords", "Keyword1 Keyword2");
      record.put("ignored_subject", "Subject is here");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put(file, record);
    }
    
    {
      String file = path + "testPDF.pdf";
      Map<String, Object> record = new LinkedHashMap();
      record.put("ignored__attachment_mimetype", "application/pdf");
      record.put("ignored_author", "Bertrand Delacrétaz");
      record.put("ignored_creation_date", "2007-09-15T09:02:31Z");
      record.put("ignored_title", "Apache Tika - Apache Tika");
      record.put("ignored_xmp_creatortool", "Firefox");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put(file, record);
    }
    
    {
      String file = path + "email.eml";
      Map<String, Object> record = new LinkedHashMap();
      String name = "Patrick Foo <foo@cloudera.com>";
      record.put("ignored__attachment_mimetype", "message/rfc822");
      record.put("ignored_author", name);
      //record.put("ignored_content_length", "1068");
      record.put("ignored_creation_date", "2013-11-27T20:01:23Z");
      record.put("ignored_message_from", name);
      record.put("ignored_message_to", name);
      record.put("ignored_creator", name);
      record.put("ignored_dc_creator", name);
      record.put("ignored_dc_title", "Test EML");
      record.put("ignored_dcterms_created", "2013-11-27T20:01:23Z");
      record.put("ignored_meta_author", name);
      record.put("ignored_meta_creation_date", "2013-11-27T20:01:23Z");
      record.put("ignored_subject", "Test EML");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put(file, record);
    }

    {
      String file = path + "testEXCEL.xlsx";
      Map<String, Object> record = new LinkedHashMap();
      record.put("ignored__attachment_mimetype", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
      record.put("ignored_author", "Keith Bennett");
      record.put("ignored_creation_date", "2007-10-01T16:13:56Z");
      record.put("ignored_title", "Simple Excel document");
      record.put("text", NON_EMPTY_FIELD);
      expectedRecordContents.put(file, record);
    }    
    
    FileUtils.copyFile(new File(RESOURCES_DIR + "/custom-mimetypes.xml"), new File(tempDir + "/custom-mimetypes.xml"));
  }
  
  @Test
  @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-6489")
  public void testSolrCellJPGCompressed() throws Exception {
    morphline = createMorphline("test-morphlines" + File.separator + "solrCellJPGCompressed");    
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    String[] files = new String[] {
        path + "testJPEG_EXIF.jpg",
        path + "testJPEG_EXIF.jpg.gz",
        path + "testJPEG_EXIF.jpg.tar.gz",
        //path + "jpeg2000.jp2",
    };
    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
  }  

  @Test
  public void testSolrCellXML() throws Exception {
    morphline = createMorphline("test-morphlines" + File.separator + "solrCellXML");    
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    String[] files = new String[] {
        path + "testXML2.xml",
    };
    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
  }  

  @Test
  @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-6489")
  public void testSolrCellDocumentTypes() throws Exception {
    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
    
    morphline = createMorphline(new File(tempDir).getAbsolutePath() +  "/test-morphlines/solrCellDocumentTypes");    
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    String[] files = new String[] {
        path + "testBMPfp.txt",
        path + "boilerplate.html",
        path + "NullHeader.docx",
        path + "testWORD_various.doc",          
        path + "testPDF.pdf",
        path + "testJPEG_EXIF.jpg",
        path + "testJPEG_EXIF.jpg.gz",
        path + "testJPEG_EXIF.jpg.tar.gz",
        path + "testXML.xml",          
        path + "cars.csv",
//        path + "cars.tsv",
//        path + "cars.ssv",
        path + "cars.csv.gz",
        path + "cars.tar.gz",
        path + "sample-statuses-20120906-141433.avro",
        path + "sample-statuses-20120906-141433",
        path + "sample-statuses-20120906-141433.gz",
        path + "sample-statuses-20120906-141433.bz2",
        path + "email.eml",
    };
    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
  }
  
  @Test
  @AwaitsFix(bugUrl="https://issues.apache.org/jira/browse/SOLR-9220")
  public void testSolrCellDocumentTypes2() throws Exception {

    AbstractSolrMorphlineTestBase.setupMorphline(tempDir, "test-morphlines/solrCellDocumentTypes", false);
    
    morphline = createMorphline(new File(tempDir).getAbsolutePath() + "/test-morphlines/solrCellDocumentTypes");    
    String path = RESOURCES_DIR + File.separator + "test-documents" + File.separator;
    String[] files = new String[] {
        path + "testPPT_various.ppt",
        path + "testPPT_various.pptx",        
        path + "testEXCEL.xlsx",
        path + "testEXCEL.xls", 
        path + "testPages.pages", 
        //path + "testNumbers.numbers", 
        //path + "testKeynote.key",
        
        path + "testRTFVarious.rtf", 
        path + "complex.mbox", 
        path + "test-outlook.msg", 
        path + "testEMLX.emlx",
        path + "testRFC822",  
        path + "rsstest.rss", 
//        path + "testDITA.dita", 
        
        path + "testMP3i18n.mp3", 
        path + "testAIFF.aif", 
        path + "testFLAC.flac", 
//        path + "testFLAC.oga", 
//        path + "testVORBIS.ogg",  
        path + "testMP4.m4a", 
        path + "testWAV.wav", 
//        path + "testWMA.wma", 
        
        path + "testFLV.flv", 
//        path + "testWMV.wmv", 
        
        path + "testBMP.bmp", 
        path + "testPNG.png", 
        path + "testPSD.psd",        
        path + "testSVG.svg",  
        path + "testTIFF.tif",     

//        path + "test-documents.7z", 
//        path + "test-documents.cpio",
//        path + "test-documents.tar", 
//        path + "test-documents.tbz2", 
//        path + "test-documents.tgz",
//        path + "test-documents.zip",
//        path + "test-zip-of-zip.zip",
//        path + "testJAR.jar",
        
//        path + "testKML.kml", 
//        path + "testRDF.rdf", 
        path + "testVISIO.vsd",
//        path + "testWAR.war", 
//        path + "testWindows-x86-32.exe",
//        path + "testWINMAIL.dat", 
//        path + "testWMF.wmf", 
    };   
    testDocumentTypesInternal(files, expectedRecords, expectedRecordContents);
  }

  /**
   * Test that the ContentHandler properly strips the illegal characters
   */
  @Test
  public void testTransformValue() {
    String fieldName = "user_name";
    assertFalse("foobar".equals(getFoobarWithNonChars()));

    Metadata metadata = new Metadata();
    // load illegal char string into a metadata field and generate a new document,
    // which will cause the ContentHandler to be invoked.
    metadata.set(fieldName, getFoobarWithNonChars());
    StripNonCharSolrContentHandlerFactory contentHandlerFactory =
      new StripNonCharSolrContentHandlerFactory(ExtractionDateUtil.DEFAULT_DATE_FORMATS);
    IndexSchema schema = h.getCore().getLatestSchema();
    SolrContentHandler contentHandler =
      contentHandlerFactory.createSolrContentHandler(metadata, new MapSolrParams(new HashMap()), schema);
    SolrInputDocument doc = contentHandler.newDocument();
    String foobar = doc.getFieldValue(fieldName).toString();
    assertTrue("foobar".equals(foobar));
  }

  /**
   * Returns string "foobar" with illegal characters interspersed.
   */
  private String getFoobarWithNonChars() {
    char illegalChar = '\uffff';
    StringBuilder builder = new StringBuilder();
    builder.append(illegalChar).append(illegalChar).append("foo").append(illegalChar)
      .append(illegalChar).append("bar").append(illegalChar).append(illegalChar);
    return builder.toString();
  }

}