/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.mspowerpoint;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import junit.framework.TestCase;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
/**
 * <p>
 * Unit tests for MSPowerPointParser.
 * </p>
 * <p>
 * Make sure sample files are copied to "test.data" as specified in
 * ./src/plugin/parse-mspowerpoint/build.xml during plugin compilation. Check
 * ./src/plugin/parse-mspowerpoint/sample/README.txt for what they are.
 * </p>
 * <p>
 * Each test parses a single PowerPoint file (by default <code>test.ppt</code>
 * in the sample directory) and, when a comparison file named
 * <code>&lt;file&gt;.content</code> or <code>&lt;file&gt;.meta</code> is found
 * in the sample directory, compares the parser output against it.
 * </p>
 *
 * @author Stephan Strittmatter - http://www.sybit.de
 *
 * @version 1.0
 */
public class TestMSPowerPointParser extends TestCase {

  private static final Log LOG = LogFactory.getLog(TestMSPowerPointParser.class);

  /** Charset used to read comparison files and to write dump files. */
  private static final String CHARSET = "UTF-8";

  /** Platform line separator; comparison files are normalized to it on load. */
  private static final String LINE_SEPARATOR = System.getProperty("line.separator");

  /** This system property is defined in ./src/plugin/build-plugin.xml */
  private static final String SAMPLE_DIR = System.getProperty("test.data",
      "build/parse-mspowerpoint/test/data");

  /** Extension id passed to {@link ParseUtil#parseByExtensionId}. */
  private static final String PARSER_ID = "parse-mspowerpoint";

  /**
   * Whether to dump the extracted data to a file for visual checks.
   */
  private static final boolean DUMP_TO_FILE = false;

  private final File sampleDir = new File(SAMPLE_DIR);

  /** The PowerPoint file under test. */
  private final File testFile;

  private String urlString;

  private Protocol protocol;

  private Content content;

  /**
   * Creates a test case that parses <code>test.ppt</code> from the sample
   * directory.
   *
   * @param name
   *          name of the test case
   */
  public TestMSPowerPointParser(String name) {
    super(name);
    this.testFile = new File(this.sampleDir, "test.ppt");
  }

  /**
   * Creates a test case for the given file. Only the file's name is used:
   * the URL to fetch and the comparison files are always resolved against
   * the sample directory.
   *
   * @param file
   *          the PowerPoint file to test
   */
  public TestMSPowerPointParser(File file) {
    super();
    this.testFile = file;
  }

  /**
   * Fetches the content of the test file through the protocol registered for
   * its URL so that each test starts from a fresh {@link Content}.
   *
   * @see TestCase#setUp()
   */
  protected void setUp() throws Exception {
    super.setUp();
    this.urlString = createUrl(this.testFile.getName());
    LOG.info("Testing file: " + this.urlString + "...");
    this.protocol = new ProtocolFactory(NutchConfiguration.create())
        .getProtocol(this.urlString);
    this.content = this.protocol.getProtocolOutput(new Text(this.urlString),
        new CrawlDatum()).getContent();
    // Fail here with a readable message instead of with a NullPointerException
    // inside the individual tests if nothing could be fetched.
    assertNotNull("No content fetched for " + this.urlString, this.content);
  }

  /**
   * @see TestCase#tearDown()
   */
  protected void tearDown() throws Exception {
    super.tearDown();
  }

  /**
   * Parses the test file and checks that some text was extracted. If a
   * comparison file <code>&lt;file&gt;.content</code> exists in
   * <code>SAMPLE_DIR</code>, the extracted text is additionally compared
   * against it character by character.
   *
   * @see #SAMPLE_DIR
   * @throws Exception
   *           if fetching, parsing or reading the comparison file fails
   */
  public void testContent() throws Exception {
    final Parse parse = parseContent();
    final ParseData data = parse.getData();
    final String text = parse.getText();

    assertTrue("No content extracted length ==0", text.length() > 0);

    this.dumpToFile(this.testFile.getName(), data, text);

    final File contentFile = findComparisonFile(".content");
    if (contentFile == null) {
      LOG.info("Comparison file for Content not available: "
          + this.testFile.getName() + ".content");
      return;
    }

    final String expected = this.fileToString(contentFile);

    // Compare only the common prefix first so that the first differing
    // character is reported, never an index-out-of-bounds error.
    final int common = Math.min(text.length(), expected.length());
    for (int i = 0; i < common; i++) {
      assertEquals("Wrong char at position [" + i + "]",
          String.valueOf(expected.charAt(i)), String.valueOf(text.charAt(i)));
    }

    // The comparison file may legitimately be longer than the parsed text
    // (fileToString always terminates the last line), but parsed text that
    // runs past the end of the comparison file is a real mismatch.
    assertTrue("Parsed text (" + text.length()
        + " chars) is longer than comparison file (" + expected.length()
        + " chars)", text.length() <= expected.length());
  }

  /**
   * Parses the test file and checks its meta data. If a comparison file
   * <code>&lt;file&gt;.meta</code> exists in <code>SAMPLE_DIR</code>, the
   * title and the number of outlinks are compared against it; otherwise the
   * test only requires a non-empty title.
   *
   * @see #SAMPLE_DIR
   * @throws Exception
   *           if fetching, parsing or reading the comparison file fails
   */
  public void testMeta() throws Exception {
    final ParseData data = parseContent().getData();

    final File metaFile = findComparisonFile(".meta");
    if (metaFile != null) {
      assertEquals("Document Title", this.fileToString(metaFile),
          "Title: " + data.getTitle() + LINE_SEPARATOR
          + "Outlinks: " + data.getOutlinks().length + LINE_SEPARATOR);
    } else {
      assertTrue("Document Title length ==0", data.getTitle().length() > 0);
      LOG.info("Comparison file for Title not available: "
          + this.testFile.getName() + ".meta");
    }
  }

  /**
   * Runs the PowerPoint parser on the content fetched in {@link #setUp()}.
   *
   * @return the parse registered under the content's URL, never
   *         <code>null</code>
   * @throws Exception
   *           if the parser fails
   */
  private Parse parseContent() throws Exception {
    final Parse parse = new ParseUtil(NutchConfiguration.create())
        .parseByExtensionId(PARSER_ID, this.content)
        .get(this.content.getUrl());
    assertNotNull("Parser returned no result for " + this.urlString, parse);
    return parse;
  }

  /**
   * Looks up the comparison file for the test file in the sample directory.
   *
   * @param suffix
   *          suffix appended to the test file's name, e.g.
   *          <code>".content"</code>
   * @return the first file accepted by the filter, or <code>null</code> if
   *         there is none or the sample directory cannot be listed
   */
  private File findComparisonFile(final String suffix) {
    final FileExtensionFilter filter = new FileExtensionFilter(
        this.testFile.getName() + suffix);
    // listFiles returns null (not an empty array) when the directory does
    // not exist or cannot be read.
    final File[] matches = this.sampleDir.listFiles(filter);
    if (matches == null || matches.length == 0) {
      return null;
    }
    return matches[0];
  }

  /**
   * Create the complete url of a file in the sample directory.
   *
   * @param fileName
   *          name of the file
   * @return complete url.
   */
  private String createUrl(final String fileName) {
    return "file:" + SAMPLE_DIR + "/" + fileName;
  }

  /**
   * Dump the parsed data to a UTF-8 formatted file for visual checks. Does
   * nothing unless {@link #DUMP_TO_FILE} is enabled.
   *
   * @param fileName
   *          base name of the dump file; <code>".txt"</code> is appended
   * @param data
   *          parse data, written via its <code>toString()</code>
   * @param text
   *          extracted text, written after the parse data
   * @throws IOException
   *           if the dump file cannot be written
   */
  private void dumpToFile(final String fileName, final ParseData data,
      final String text) throws IOException {
    if (TestMSPowerPointParser.DUMP_TO_FILE) {
      final File file = new File(fileName + ".txt");
      final FileOutputStream fos = new FileOutputStream(file);
      try {
        final OutputStreamWriter osw = new OutputStreamWriter(fos, CHARSET);
        osw.write(data.toString());
        osw.write(text);
        // Push the encoded bytes to the stream before it is closed below.
        osw.flush();
      } finally {
        // Always release the file handle, even if writing failed.
        fos.close();
      }
    }
  }

  /**
   * Load a comparison file. Line endings are normalized: every line,
   * including the last one, is terminated with the platform line separator.
   *
   * @param file
   *          file to load
   * @return content of the file, decoded as UTF-8.
   * @throws IOException
   *           if the file cannot be read
   */
  private String fileToString(final File file) throws IOException {
    final StringBuilder buf = new StringBuilder();
    final FileInputStream fis = new FileInputStream(file);
    try {
      final BufferedReader br = new BufferedReader(
          new InputStreamReader(fis, CHARSET));
      String line;
      while ((line = br.readLine()) != null) {
        buf.append(line).append(LINE_SEPARATOR);
      }
    } finally {
      // Closing the underlying stream is sufficient for a reader and is
      // guaranteed to run even if the reader could not be created.
      fis.close();
    }
    return buf.toString();
  }
}