/**
 * 
 */
package uk.bl.wa.solr;

/*
 * #%L
 * warc-indexer
 * %%
 * Copyright (C) 2013 - 2015 The UK Web Archive
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, either version 2 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/gpl-2.0.html>.
 * #L%
 */

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;

import java.io.File;
import java.io.InputStream;
import java.net.URL;

import org.junit.Before;
import org.junit.Test;

/**
 * Tests {@link TikaExtractor} against a saved HTML page held in the test
 * resources directory.
 * 
 * @author Andrew Jackson <Andrew.Jackson@bl.uk>
 * 
 */
public class TikaExtractorTest {

    /** Extractor under test; a fresh instance is created before each test. */
    private TikaExtractor tika;

    /**
     * Creates the {@link TikaExtractor} instance used by the tests.
     * 
     * @throws java.lang.Exception
     *             if the extractor cannot be constructed
     */
    @Before
    public void setUp() throws Exception {
        tika = new TikaExtractor();
    }

    /**
     * Runs the extractor over the saved "Mona Lisa" HTML page and checks the
     * extracted-text field: it must contain "Mona Lisa" and must not contain
     * "encyclopediaMona", which would indicate that two pieces of text were
     * joined without a separator (bad newline handling).
     * 
     * @throws Exception
     *             if the test file cannot be opened or the extraction fails
     */
    @Test
    public void testMonaLisa() throws Exception {
        File ml = new File(
                "src/test/resources/wikipedia-mona-lisa/Mona_Lisa.html");
        URL url = ml.toURI().toURL();
        SolrRecord solr = new SolrRecord();

        // Close the stream explicitly so the file handle is released even
        // when the extraction throws. NOTE(review): whether extract() closes
        // the stream itself is not visible here; a second close() on an
        // InputStream has no effect.
        InputStream in = url.openStream();
        try {
            tika.extract(solr, in, url.toString());
        } finally {
            in.close();
        }
        System.out.println("SOLR " + solr.getSolrDocument().toString());

        // Fail with a readable message rather than a NullPointerException
        // when the extractor did not populate the text field.
        assertNotNull("Extracted text field should be present!",
                solr.getField(SolrFields.SOLR_EXTRACTED_TEXT));
        String text = (String) solr.getField(SolrFields.SOLR_EXTRACTED_TEXT)
                .getValue();
        assertNotNull("Extracted text should not be null!", text);

        assertTrue("Text should contain this string!",
                text.contains("Mona Lisa"));
        assertFalse(
                "Text should NOT contain this string! (implies bad newline handling)",
                text.contains("encyclopediaMona"));
    }

}