/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.rtf;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.Tika;
import org.apache.tika.TikaTest;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.ParserContainerExtractor;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
import org.apache.tika.metadata.RTFMetadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.RecursiveParserWrapper;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
import org.junit.Test;
import org.xml.sax.ContentHandler;
/**
 * JUnit test class for the Tika {@link RTFParser}.
 *
 * <p>Each test parses one of the RTF fixtures under {@code /test-documents/}
 * and checks the extracted text, XHTML, metadata or embedded resources.
 * Helpers such as {@code getXML}, {@code getRecursiveMetadata},
 * {@code assertContains} and the embedded-resource handlers are expected to be
 * provided by {@link TikaTest}.
 */
public class RTFParserTest extends TikaTest {

    /** Shared facade used for plain-text extraction; never reassigned. */
    private final Tika tika = new Tika();

    /** Checks content type detection and basic text extraction. */
    @Test
    public void testBasicExtraction() throws Exception {
        // Use the shared helper so that the input stream is reliably closed.
        Result r = getResult("testRTF.rtf");

        assertEquals("application/rtf", r.metadata.get(Metadata.CONTENT_TYPE));
        assertEquals(1, r.metadata.getValues(Metadata.CONTENT_TYPE).length);
        assertContains("Test", r.text);
        assertContains("indexation Word", r.text);
    }

    /** After stripping all whitespace, only the umlaut word must remain. */
    @Test
    public void testUmlautSpacesExtraction2() throws Exception {
        String content = getText("testRTFUmlautSpaces2.rtf");
        content = content.replaceAll("\\s+", "");
        assertEquals("\u00DCbersicht", content);
    }

    /** Characters following a uc control word must not be emitted twice. */
    @Test
    public void testUnicodeUCNControlWordCharacterDoublingExtraction() throws Exception {
        String content = getText("testRTFUnicodeUCNControlWordCharacterDoubling.rtf");

        assertContains("\u5E74", content);
        assertContains("\u5ff5", content);
        assertContains("0 ", content);
        assertContains("abc", content);
        assertFalse("Doubled character \u5E74", content.contains("\u5E74\u5E74"));
    }

    /** A hex escape in the middle of a word must not split the word. */
    @Test
    public void testHexEscapeInsideWord() throws Exception {
        String content = getText("testRTFHexEscapeInsideWord.rtf");
        assertContains("ESP\u00cdRITO", content);
    }

    /** Checks lower- and upper-case Polish text from a Windows-1250 document. */
    @Test
    public void testWindowsCodepage1250() throws Exception {
        String content = getText("testRTFWindowsCodepage1250.rtf");
        assertContains("za\u017c\u00f3\u0142\u0107 g\u0119\u015bl\u0105 ja\u017a\u0144", content);
        assertContains("ZA\u017b\u00d3\u0141\u0106 G\u0118\u015aL\u0104 JA\u0179\u0143", content);
    }

    /** Table cells must be separated by whitespace in the extracted text. */
    @Test
    public void testTableCellSeparation() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFTableCellSeparation.rtf");
        String content = tika.parseToString(file);
        content = content.replaceAll("\\s+", " ");
        assertContains("a b c d \u00E4 \u00EB \u00F6 \u00FC", content);
    }

    /** Second table fixture: neighbouring cells must not be glued together. */
    @Test
    public void testTableCellSeparation2() throws Exception {
        String content = getText("testRTFTableCellSeparation2.rtf");
        // TODO: why do we insert extra whitespace...?
        content = content.replaceAll("\\s+", " ");
        assertContains("Station Fax", content);
    }

    /** Czech characters in the WordPad fixture. */
    @Test
    public void testWordPadCzechCharactersExtraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFWordPadCzechCharacters.rtf");
        String s1 = tika.parseToString(file);
        assertContains("\u010Cl\u00E1nek t\u00FDdne", s1);
        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", s1);
    }

    /** Czech characters in the Word 2010 fixture. */
    @Test
    public void testWord2010CzechCharactersExtraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFWord2010CzechCharacters.rtf");
        String s1 = tika.parseToString(file);
        assertContains("\u010Cl\u00E1nek t\u00FDdne", s1);
        assertContains("starov\u011Bk\u00E9 \u017Eidovsk\u00E9 n\u00E1bo\u017Eensk\u00E9 texty", s1);
    }

    /** Body text and title of the MS932 fixture. */
    @Test
    public void testMS932Extraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTF-ms932.rtf");
        String s1 = tika.parseToString(file);

        // Hello in Japanese
        assertContains("\u3053\u3093\u306b\u3061\u306f", s1);

        // Verify title, since it was also encoded with MS932:
        Result r = getResult("testRTF-ms932.rtf");
        assertEquals("\u30bf\u30a4\u30c8\u30eb", r.metadata.get(TikaCoreProperties.TITLE));
    }

    /** The umlaut word must appear intact in the extracted text. */
    @Test
    public void testUmlautSpacesExtraction() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFUmlautSpaces.rtf");
        String s1 = tika.parseToString(file);
        assertContains("\u00DCbersicht", s1);
    }

    /** Gothic text is expected as surrogate pairs in the extracted text. */
    @Test
    public void testGothic() throws Exception {
        String content = getText("testRTFUnicodeGothic.rtf");
        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }

    /** Japanese body text plus title, creator and comments metadata. */
    @Test
    public void testJapaneseText() throws Exception {
        Result r = getResult("testRTFJapanese.rtf");
        String content = r.text;

        // Verify title -- this title uses upr escape inside
        // title info field:
        assertEquals("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f\u3000",
                r.metadata.get(TikaCoreProperties.TITLE));
        assertEquals("VMazel", r.metadata.get(TikaCoreProperties.CREATOR));
        assertEquals("VMazel", r.metadata.get(Metadata.AUTHOR));
        assertEquals("StarWriter", r.metadata.get(TikaCoreProperties.COMMENTS));

        // Special version of (GHQ)
        assertContains("\uff08\uff27\uff28\uff31\uff09", content);

        // 6 other characters
        assertContains("\u6771\u4eac\u90fd\u4e09\u9df9\u5e02", content);
    }

    /**
     * Exercises the default, per-instance and per-call string length limits of
     * {@link Tika#parseToString}.
     */
    @Test
    public void testMaxLength() throws Exception {
        File file = getResourceAsFile("/test-documents/testRTFJapanese.rtf");
        Metadata metadata = new Metadata();

        // Note: parseToString closes the stream it is given, so the streams
        // opened below are not closed explicitly.

        // Test w/ default limit:
        Tika localTika = new Tika();
        InputStream stream = TikaInputStream.get(file, metadata);
        String content = localTika.parseToString(stream, metadata);
        assertTrue(content.length() > 500);

        // Test setting max length on the instance:
        localTika.setMaxStringLength(200);
        stream = TikaInputStream.get(file, metadata);
        content = localTika.parseToString(stream, metadata);
        assertTrue(content.length() <= 200);

        // Test setting max length per-call:
        stream = TikaInputStream.get(file, metadata);
        content = localTika.parseToString(stream, metadata, 100);
        assertTrue(content.length() <= 100);
    }

    /** Literal curly braces in the text must be preserved. */
    @Test
    public void testTextWithCurlyBraces() throws Exception {
        String content = getText("testRTFWithCurlyBraces.rtf");
        assertContains("{ some text inside curly brackets }", content);
    }

    /** Dashes, special hyphens/spaces and typographic quotes. */
    @Test
    public void testControls() throws Exception {
        String content = getText("testRTFControls.rtf");

        assertContains("Thiswordhasanem\u2014dash", content);
        assertContains("Thiswordhasanen\u2013dash", content);
        assertContains("Thiswordhasanon\u2011breakinghyphen", content);
        assertContains("Thiswordhasanonbreaking\u00a0space", content);
        assertContains("Thiswordhasanoptional\u00adhyphen", content);
        assertContains("\u2018Single quoted text\u2019", content);
        assertContains("\u201cDouble quoted text\u201d", content);
        assertContains("\u201cDouble quoted text again\u201d", content);
    }

    /** Unpaired or mismatched surrogates are expected as U+FFFD. */
    @Test
    public void testInvalidUnicode() throws Exception {
        String content = getText("testRTFInvalidUnicode.rtf");

        assertContains("Unpaired hi \ufffd here", content);
        assertContains("Unpaired lo \ufffd here", content);
        assertContains("Mismatched pair \ufffd\ufffd here", content);
    }

    /**
     * Broad check over one fixture containing footnotes, header/footer, text
     * box, styled runs, tables, lists, metadata and non-Latin text.
     */
    @Test
    public void testVarious() throws Exception {
        Result r = getResult("testRTFVarious.rtf");
        String content = r.text;
        // Whitespace-normalised copy used for the multi-cell table checks.
        String normalized = content.replaceAll("\\s+", " ");

        assertContains("Footnote appears here", content);
        assertContains("This is a footnote.", content);
        assertContains("This is the header text.", content);
        assertContains("This is the footer text.", content);
        assertContains("Here is a text box", content);
        assertContains("Bold", content);
        assertContains("italic", content);
        assertContains("underline", content);
        assertContains("superscript", content);
        assertContains("subscript", content);
        assertContains("Here is a citation:", content);
        assertContains("Figure 1 This is a caption for Figure 1", content);
        assertContains("(Kramer)", content);

        // Table
        assertContains("Row 1 Col 1 Row 1 Col 2 Row 1 Col 3 Row 2 Col 1 Row 2 Col 2 Row 2 Col 3", normalized);

        // 2-columns
        assertContains("Row 1 column 1 Row 2 column 1 Row 1 column 2 Row 2 column 2", normalized);

        assertContains("This is a hyperlink", content);
        assertContains("Here is a list:", content);
        for (int row = 1; row <= 3; row++) {
            assertContains("Bullet " + row, content);
        }
        assertContains("Here is a numbered list:", content);
        for (int row = 1; row <= 3; row++) {
            assertContains("Number bullet " + row, content);
        }

        for (int row = 1; row <= 2; row++) {
            for (int col = 1; col <= 3; col++) {
                assertContains("Row " + row + " Col " + col, content);
            }
        }

        assertContains("Keyword1 Keyword2", content);
        assertEquals("Keyword1 Keyword2",
                r.metadata.get(TikaCoreProperties.KEYWORDS));

        assertContains("Subject is here", content);
        assertEquals("Subject is here",
                r.metadata.get(OfficeOpenXMLCore.SUBJECT));
        assertEquals("Subject is here",
                r.metadata.get(Metadata.SUBJECT));

        assertContains("Suddenly some Japanese text:", content);
        // Special version of (GHQ)
        assertContains("\uff08\uff27\uff28\uff31\uff09", content);
        // More Japanese text
        assertContains("\u30be\u30eb\u30b2\u3068\u5c3e\u5d0e\u3001\u6de1\u3005\u3068\u6700\u671f", content);

        assertContains("And then some Gothic text:", content);
        assertContains("\uD800\uDF32\uD800\uDF3f\uD800\uDF44\uD800\uDF39\uD800\uDF43\uD800\uDF3A", content);
    }

    /** Bold and italic runs must be marked up in the XHTML output. */
    @Test
    public void testVariousStyle() throws Exception {
        String content = getXML("testRTFVarious.rtf").xml;
        assertContains("<b>Bold</b>", content);
        assertContains("<i>italic</i>", content);
    }

    /** Nesting and transitions between bold and italic runs. */
    @Test
    public void testBoldItalic() throws Exception {
        String content = getXML("testRTFBoldItalic.rtf").xml;
        assertContains("<b>bold</b>", content);
        assertContains("<b>bold </b><b><i>italic</i></b>", content);
        assertContains("<b><i>italic </i></b><b>bold</b>", content);
        assertContains("<i>italic</i>", content);
        assertContains("<b>bold then </b><b><i>italic then</i></b><i> not bold</i>", content);
        assertContains("<i>italic then </i><b><i>bold then</i></b><b> not italic</b>", content);
    }

    /** Hyperlinks become anchors and no tab-only paragraph is emitted. */
    @Test
    public void testHyperlink() throws Exception {
        String content = getXML("testRTFHyperlink.rtf").xml;
        assertContains("our most <a href=\"http://r.office.microsoft.com/r/rlidwelcomeFAQ?clid=1033\">frequently asked questions</a>", content);
        assertFalse(content.contains("<p>\t\t</p>"));
    }

    /** An ignored control word must not disturb the surrounding paragraph. */
    @Test
    public void testIgnoredControlWord() throws Exception {
        assertContains("<p>The quick brown fox jumps over the lazy dog</p>", getXML("testRTFIgnoredControlWord.rtf").xml);
    }

    /** Cyrillic text must be decoded correctly in this fixture. */
    @Test
    public void testFontAfterBufferedText() throws Exception {
        assertContains("\u0423\u0432\u0430\u0436\u0430\u0435\u043c\u044b\u0439 \u043a\u043b\u0438\u0435\u043d\u0442!",
                getXML("testFontAfterBufferedText.rtf").xml);
    }

    /** Ordered and unordered lists from the Microsoft Word fixture. */
    @Test
    public void testListMicrosoftWord() throws Exception {
        String content = getXML("testRTFListMicrosoftWord.rtf").xml;
        assertContains("<ol>\t<li>one</li>", content);
        assertContains("</ol>", content);
        assertContains("<ul>\t<li>first</li>", content);
        assertContains("</ul>", content);
    }

    /** Ordered and unordered lists from the LibreOffice fixture. */
    @Test
    public void testListLibreOffice() throws Exception {
        String content = getXML("testRTFListLibreOffice.rtf").xml;
        assertContains("<ol>\t<li>one</li>", content);
        assertContains("</ol>", content);
        assertContains("<ul>\t<li>first</li>", content);
        assertContains("</ul>", content);
    }

    // TIKA-782
    /** The fixture must yield exactly one 10-byte embedded resource. */
    @Test
    public void testBinControlWord() throws Exception {
        ByteCopyingHandler embHandler = new ByteCopyingHandler();
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testBinControlWord.rtf"))) {
            ContainerExtractor ex = new ParserContainerExtractor();
            assertTrue(ex.isSupported(tis));
            ex.extract(tis, ex, embHandler);
        }
        assertEquals(1, embHandler.bytes.size());

        byte[] bytes = embHandler.bytes.get(0);
        assertEquals(10, bytes.length);
        // 125 is the ASCII code of '}'
        assertEquals(125, (int) bytes[4]);
        //make sure that at least the last value is correct
        assertEquals(-1, (int) bytes[9]);
    }

    // TIKA-999
    /** Page, word and character counts plus the creation date. */
    @Test
    public void testMetaDataCounts() throws Exception {
        XMLResult xml = getXML("testRTFWord2010CzechCharacters.rtf");
        assertEquals("1", xml.metadata.get(Office.PAGE_COUNT));
        assertEquals("70", xml.metadata.get(Office.WORD_COUNT));
        assertEquals("401", xml.metadata.get(Office.CHARACTER_COUNT));

        // Fail with an assertion rather than a NullPointerException when the
        // creation date is missing.
        String creationDate = xml.metadata.get(Office.CREATION_DATE);
        assertNotNull(creationDate);
        assertTrue(creationDate.startsWith("2010-10-13T"));
    }

    // TIKA-1192
    /** Body text must still be extracted from the list-override fixture. */
    @Test
    public void testListOverride() throws Exception {
        String content = getText("testRTFListOverride.rtf");
        assertContains("Body", content);
    }

    // TIKA-1305
    /** Text must still be extracted from the corrupt list-override fixture. */
    @Test
    public void testCorruptListOverride() throws Exception {
        String content = getText("testRTFCorruptListOverride.rtf");
        assertContains("apple", content);
    }

    // TIKA-1010
    /**
     * Checks file name and content type of selected entries of the recursive
     * metadata list; map keys are indexes into that list.
     */
    @Test
    public void testEmbeddedMonster() throws Exception {
        Map<Integer, Pair> expected = new HashMap<>();
        expected.put(3, new Pair("Hw.txt", "text/plain; charset=ISO-8859-1"));
        expected.put(4, new Pair("file_0.doc", "application/msword"));
        expected.put(7, new Pair("file_1.xlsx",
                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"));
        expected.put(10, new Pair("text.html", "text/html; charset=windows-1252"));
        expected.put(11, new Pair("html-within-zip.zip", "application/zip"));
        expected.put(12, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip"));
        expected.put(15, new Pair("testHTML_utf8_\u666E\u6797\u65AF\u987F.html", "text/html; charset=UTF-8"));
        expected.put(18, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));
        expected.put(21, new Pair("file_2.xls", "application/vnd.ms-excel"));
        expected.put(24, new Pair("testMSG_\u666E\u6797\u65AF\u987F.msg", "application/vnd.ms-outlook"));
        expected.put(27, new Pair("file_3.pdf", "application/pdf"));
        expected.put(30, new Pair("file_4.ppt", "application/vnd.ms-powerpoint"));
        expected.put(34, new Pair("file_5.pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"));
        expected.put(33, new Pair("thumbnail.jpeg", "image/jpeg"));
        expected.put(37, new Pair("file_6.doc", "application/msword"));
        expected.put(40, new Pair("file_7.doc", "application/msword"));
        expected.put(43, new Pair("file_8.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"));
        expected.put(46, new Pair("testJPEG_\u666E\u6797\u65AF\u987F.jpg", "image/jpeg"));

        List<Metadata> metadataList = getRecursiveMetadata("testRTFEmbeddedFiles.rtf");
        assertEquals(49, metadataList.size());
        for (Map.Entry<Integer, Pair> e : expected.entrySet()) {
            Metadata metadata = metadataList.get(e.getKey());
            Pair p = e.getValue();
            assertNotNull(metadata.get(Metadata.RESOURCE_NAME_KEY));
            //necessary to getName() because MSOffice extractor includes
            //directory: _1457338524/HW.txt
            assertEquals("filename equals ",
                    p.fileName, FilenameUtils.getName(
                            metadata.get(RecursiveParserWrapper.EMBEDDED_RESOURCE_PATH)));

            assertEquals(p.mimeType, metadata.get(Metadata.CONTENT_TYPE));
        }
        // Escaped like every other occurrence of this name in the class, so
        // the expected value does not depend on the source file encoding.
        assertEquals("C:\\Users\\tallison\\AppData\\Local\\Temp\\testJPEG_\u666E\u6797\u65AF\u987F.jpg",
                metadataList.get(46).get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME));
    }

    //TIKA-1010 test regular (not "embedded") images/picts
    /** Metadata of two images must be reported separately and not leak. */
    @Test
    public void testRegularImages() throws Exception {
        Parser base = new AutoDetectParser();
        ParseContext ctx = new ParseContext();
        RecursiveParserWrapper parser = new RecursiveParserWrapper(base,
                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.IGNORE, -1));
        ContentHandler handler = new BodyContentHandler();
        Metadata rootMetadata = new Metadata();
        rootMetadata.add(Metadata.RESOURCE_NAME_KEY, "testRTFRegularImages.rtf");
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFRegularImages.rtf"))) {
            parser.parse(tis, handler, rootMetadata, ctx);
        }
        List<Metadata> metadatas = parser.getMetadata();

        Metadata jpgExifMetadata = metadatas.get(1);//("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
        Metadata jpgMetadata = metadatas.get(3);//("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
        assertNotNull(jpgExifMetadata);
        assertNotNull(jpgMetadata);

        assertTrue(Arrays.asList(jpgExifMetadata.getValues("dc:subject")).contains("serbor"));
        assertTrue(jpgMetadata.get("Comments").contains("Licensed to the Apache"));
        //make sure old metadata doesn't linger between objects
        assertFalse(Arrays.asList(jpgMetadata.getValues("dc:subject")).contains("serbor"));
        assertEquals("false", jpgMetadata.get(RTFMetadata.THUMBNAIL));
        assertEquals("false", jpgExifMetadata.get(RTFMetadata.THUMBNAIL));

        assertEquals(49, jpgMetadata.names().length);
        assertEquals(113, jpgExifMetadata.names().length);
    }

    /** Runs of blank lines must appear as empty paragraphs in the XHTML. */
    @Test
    public void testMultipleNewlines() throws Exception {
        String content = getXML("testRTFNewlines.rtf").xml;
        content = content.replaceAll("[\r\n]+", " ");
        assertContains("<body><p>one</p> " +
                "<p /> " +
                "<p>two</p> " +
                "<p /> " +
                "<p /> " +
                "<p>three</p> " +
                "<p /> " +
                "<p /> " +
                "<p /> " +
                "<p>four</p>", content);
    }

    //TIKA-1010 test linked embedded doc
    /**
     * Extracts the same fixture twice: once with a handler that is given
     * image/emf and image/wmf as skip types, once with a default handler.
     */
    @Test
    public void testEmbeddedLinkedDocument() throws Exception {
        Set<MediaType> skipTypes = new HashSet<>();
        skipTypes.add(MediaType.parse("image/emf"));
        skipTypes.add(MediaType.parse("image/wmf"));

        TrackingHandler tracker = new TrackingHandler(skipTypes);
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
            ContainerExtractor ex = new ParserContainerExtractor();
            assertTrue(ex.isSupported(tis));
            ex.extract(tis, ex, tracker);
        }
        //should gracefully skip link and not throw NPE, IOEx, etc
        assertEquals(0, tracker.filenames.size());

        tracker = new TrackingHandler();
        try (TikaInputStream tis = TikaInputStream.get(getResourceAsStream("/test-documents/testRTFEmbeddedLink.rtf"))) {
            ContainerExtractor ex = new ParserContainerExtractor();
            assertTrue(ex.isSupported(tis));
            ex.extract(tis, ex, tracker);
        }
        //without skip types, two resources are expected to be tracked
        assertEquals(2, tracker.filenames.size());
    }

    /** Parses with a custom config and expects a memory-limit exception. */
    @Test
    public void testConfig() throws Exception {
        //test that memory allocation of the bin element is limited
        //via the config file. Unfortunately, this test file's bin embedding contains 10 bytes
        //so we had to set the config to 0.
        try (InputStream is = getClass().getResourceAsStream("/org/apache/tika/parser/rtf/tika-config.xml")) {
            assertNotNull(is);
            TikaConfig tikaConfig = new TikaConfig(is);
            Parser p = new AutoDetectParser(tikaConfig);
            List<Metadata> metadataList = getRecursiveMetadata("testBinControlWord.rtf", p);
            assertEquals(1, metadataList.size());
            assertContains("TikaMemoryLimitException", metadataList.get(0).get(TikaCoreProperties.TIKA_META_EXCEPTION_EMBEDDED_STREAM));
        }
    }

    /**
     * Parses a fixture from {@code /test-documents/} with the shared parser
     * and returns the written-out text together with the collected metadata.
     *
     * @param filename name of the file below {@code /test-documents/}
     * @return extracted text and metadata
     * @throws Exception if the resource cannot be read or parsing fails
     */
    private Result getResult(String filename) throws Exception {
        File file = getResourceAsFile("/test-documents/" + filename);

        Metadata metadata = new Metadata();
        StringWriter writer = new StringWriter();
        // Close the file stream even when parsing throws.
        try (InputStream stream = new FileInputStream(file)) {
            tika.getParser().parse(
                    stream,
                    new WriteOutContentHandler(writer),
                    metadata,
                    new ParseContext());
        }
        return new Result(writer.toString(), metadata);
    }

    /**
     * Convenience wrapper around {@link #getResult(String)} returning only the
     * extracted text.
     */
    private String getText(String filename) throws Exception {
        return getResult(filename).text;
    }

    /** Immutable holder for the text and metadata of one parse. */
    private static class Result {
        public final String text;
        public final Metadata metadata;

        public Result(String text, Metadata metadata) {
            this.text = text;
            this.metadata = metadata;
        }
    }

    /** Expected file name and content type of one embedded resource. */
    private static class Pair {
        final String fileName;
        final String mimeType;

        Pair(String fileName, String mimeType) {
            this.fileName = fileName;
            this.mimeType = mimeType;
        }
    }
}