/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nutch.parse.mspowerpoint;
import java.util.Enumeration;
import java.util.Hashtable;
import java.util.List;
import java.util.Vector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.poi.hdf.extractor.Utils;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.util.LittleEndian;
import org.apache.poi.util.StringUtil;
/**
* Listener to read the content of PowerPoint file and transfere it to the
* passed <code>StringBuffer</code>.
*
* @author Stephan Strittmatter - http://www.sybit.de
*
* @version 1.0
*
*/
class ContentReaderListener implements POIFSReaderListener {
private static final Log LOG = LogFactory.getLog(ContentReaderListener.class);
/** Buffer holding the content of the file */
protected final transient StringBuffer buf;
/**
* Constructs Listener to get content of PowerPoint file.
*
* @param content
* StringBuffer refereing the content of the PowerPoint file.
*/
public ContentReaderListener(final StringBuffer content) {
this.buf = content;
}
/**
* Reads the internal PowerPoint document stream.
*
* @see org.apache.poi.poifs.eventfilesystem.POIFSReaderListener#processPOIFSReaderEvent(org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent)
*/
public void processPOIFSReaderEvent(final POIFSReaderEvent event) {
if (event == null || event.getName() == null
|| !event.getName().startsWith(PPTConstants.POWERPOINT_DOCUMENT)) {
if (LOG.isWarnEnabled()) {
LOG.warn("Stream not processed. It is not a PowerPoint document: : "
+ event.getName());
}
return;
}
try {
final DocumentInputStream dis = event.getStream();
final byte pptdata[] = new byte[dis.available()];
dis.read(pptdata, 0, dis.available());
int offset = 0;
long offsetPD = 0;
/*
* Traverse Bytearray to get CurrentUserEditAtom Call to extract the Text
* in all PlaceHolders to hold PPTClientTextBox objects for mapping into
* Slide Objects
*/
Hashtable/* <Long, TextBox> */containerTextBox = new Hashtable/*
* <Long,
* TextBox>
*/();
// Traverse ByteArray to identiy edit paths of ClientTextBoxes
long n = pptdata.length - 20;
for (long i = 0; i < n; i++) {
final long type = LittleEndian.getUShort(pptdata, (int) i + 2);
// final long size = LittleEndian.getUInt(pptdata, (int) i + 4);
if (PPTConstants.PPT_ATOM_USEREDIT == type) {
/*
* Checking the Record Header (UserEditAtom)
*/
// final long lastSlideID = LittleEndian.getInt(pptdata, (int) i + 8);
// final long version = LittleEndian.getUInt(pptdata, (int) i + 12);
offset = (int) LittleEndian.getUInt(pptdata, (int) i + 16);
offsetPD = LittleEndian.getUInt(pptdata, (int) i + 20);
/*
* Call to extract ClientTextBox text in each UserEditAtom
*/
containerTextBox = extractTextBoxes(containerTextBox, offset,
pptdata, offsetPD);
} else if (PPTConstants.PPT_ATOM_DRAWINGGROUP == type) {
// if (LOG.isTraceEnabled()) {
// LOG.trace("PPT_DRAWINGGROUP_ATOM ignored: " + type);
// }
} else if (PPTConstants.PPT_ATOM_TEXTBYTE == type) {
// if (LOG.isTraceEnabled()) {
// LOG.trace("PPT_TEXTBYTE_ATOM ignored: " + type);
// }
} else if (PPTConstants.PPT_ATOM_TEXTCHAR == type) {
// if (LOG.isTraceEnabled()) {
// LOG.trace("PPT_TEXTCHAR_ATOM ignored: " + type);
// }
} else {
// no action
// if (LOG.isTraceEnabled()) {
// LOG.trace("type not handled: " + type);
// }
}
}
final List/* <PPTSlide> */slides = extractSlides(offset, pptdata,
offsetPD);
if (slides.size() == 0) {
if (LOG.isInfoEnabled()) { LOG.info("No slides extracted!"); }
} else {
Slide slide = (Slide) slides.get(slides.size() - 1);
for (Enumeration enumeration = containerTextBox.elements(); enumeration
.hasMoreElements();) {
final TextBox textBox = (TextBox) enumeration.nextElement();
slide.addContent(textBox.getContent());
}
/*
* Merging TextBox data with Slide Data Printing the text from Slides
* vector object.
*/
List scontent;
for (int i = 0; i < slides.size(); i++) {
slide = (Slide) slides.get(i);
scontent = slide.getContent();
String contentText;
for (int j = 0; j < scontent.size(); j++) {
contentText = scontent.get(j).toString();
this.buf.append(contentText);
// to avoid concatinated words we add a blank additional
if (contentText.length() > 0
&& !(contentText.endsWith("\r") || contentText.endsWith("\n"))) {
this.buf.append(" ");
}
}
}
}
} catch (Throwable ex) {
// because of not killing complete crawling all Throwables are catched.
if (LOG.isErrorEnabled()) { LOG.error("processPOIFSReaderEvent", ex); }
}
}
/**
* Extracts the client text boxes of a slide.
*
* @param containerTextBox
* @param offset
* @param pptdata
* @param offsetPD
* @return Hashtable
* @see TextBox
*/
protected Hashtable/* <Long, TextBox> */extractTextBoxes(
final Hashtable/* <Long, TextBox> */containerTextBox, final int offset,
final byte[] pptdata, final long offsetPD) {
// To hold temporary data
FilteredStringWriter outStream = new FilteredStringWriter();
TextBox textBox;
// Traversing the bytearray up to Presist directory position
for (int i = offset; i < offsetPD - 20; i++) {
try {
// Record info
// final long rinfo = LittleEndian.getUShort(pptdata, (int) i);
// Record Type
final long recordType = LittleEndian.getUShort(pptdata, i + 2);
// Record Size
final long recordSize = LittleEndian.getUInt(pptdata, i + 4);
if (recordType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
/*
* Record type is of Drawing Group
*/
// Total number of objects
// final long objectCount = LittleEndian.getUInt(pptdata, (int) i +
// 8);
// currentID = Group ID+number of objects
long currentID = LittleEndian.getInt(pptdata, i + 12);
currentID = ((int) (currentID / 1024)) * 1024;
if (currentID == PPTConstants.PPT_MASTERSLIDE) {
// Ignore Master Slide objects
if (LOG.isTraceEnabled()) { LOG.trace("Ignore master slide."); }
i++;
continue;
}
// Check for the ClientTextBox GroupID existence
if (containerTextBox.containsKey(new Long(currentID))) {
// If exists get Client Textbox Group
textBox = (TextBox) containerTextBox.get(new Long(currentID));
textBox.setContent("");
} else {
textBox = new TextBox(currentID);
containerTextBox.put(new Long(currentID), textBox);
}
/*
* Iterating the bytearray for TextCharAtoms and TextBytesAtom
*/
if ((offsetPD - 20) != recordSize) {
// TODO something wrong? Probably an OLE-Object, which we ignore.
if (LOG.isDebugEnabled()) {
LOG.debug("offsetPD - 20=" + (offsetPD - 20) + " recordsize="
+ recordSize);
}
} else {
for (int startPos = i + 8; startPos < offsetPD - 20
&& startPos < recordSize; startPos++) { // && startPos <
// recordSize??
try {
// Record info
// final long nrinfo = LittleEndian.getUShort(pptdata, (int) j);
// Record Type
final long ntype = LittleEndian
.getUShort(pptdata, startPos + 2);
// Record size
// Note that the size doesn't include the 8 byte atom header
final long nsize = LittleEndian.getUInt(pptdata, startPos + 4);
if (ntype == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
/*
* Break the loop if next GroupID found
*/
i = startPos - 1;
break;
} else if (ntype == PPTConstants.PPT_ATOM_TEXTBYTE) {
// TextByteAtom record
outStream = new FilteredStringWriter();
long ii = 0;
for (ii = startPos + 6; ii <= startPos + 6 + nsize; ii++) {
// For loop to changed to a function
// if ((ii + 2) >= pptdata.length)
// break; // FIXME
outStream.write((char) (pptdata[(int) ii + 2]));
}
// Setting the identified text for Current
// groupID
textBox.setContent(textBox.getContent()
+ outStream.toString());
} else if (ntype == PPTConstants.PPT_ATOM_TEXTCHAR) {
// TextCharAtom record
final String strTempContent = new String(pptdata,
startPos + 6, (int) (nsize) + 2);
final byte bytes[] = strTempContent.getBytes();
if (true) {
outStream = new FilteredStringWriter();
for (int ii = 0; ii < bytes.length - 1; ii += 2) {
// For loop to changed to a function
outStream.write((char) (pptdata[ii + 2]));
}
textBox.setContent(textBox.getContent()
+ outStream.toString());
} else {
// this version is used within POI
String text = StringUtil.getFromCompressedUnicode(bytes, 0,
bytes.length);
textBox.setContent(textBox.getContent() + text);
}
} else {
// ignored
// if (LOG.isTraceEnabled()) {
// LOG.trace("Ignored atom type: " + type);
// }
}
} catch (Throwable e) {
if (LOG.isErrorEnabled()) { LOG.error("extractTextBoxes", e); }
break;
}
}
}
} else {
// Record type is ignored
// if (LOG.isTraceEnabled()) {
// LOG.trace("Ignored record type: " + type);
// }
}
} catch (Throwable ee) {
if (LOG.isErrorEnabled()) { LOG.error("extractClientTextBoxes", ee); }
break;
}
}
return containerTextBox;
}
/**
* Returns the Powerpoint <code>Slide</code> s of document as vector.
*
* @param offset
* @param pptdata
* @param offsetPD
* @return Vector of the powerpoint slides. Contains
* <code>{@link Slide Slide}</code>
* @see Slide
*/
protected List /* <Slide> */extractSlides(final long offset,
final byte[] pptdata, final long offsetPD) {
int sNum = 0;
// List of all slides found
final List/* <Slide> */slides = new Vector/* <Slide> */();
// current slide data
Slide currentSlide = null;
// To store data found in TextCharAtoms and TextBytesAtoms
FilteredStringWriter outStream;
for (long i = offset; i < pptdata.length - 20; i++) {
final long recordInfo = LittleEndian.getUShort(pptdata, (int) i);
final long atomType = LittleEndian.getUShort(pptdata, (int) i + 2);
final long atomSize = LittleEndian.getUInt(pptdata, (int) i + 4);
if (atomType == PPTConstants.PPT_ATOM_TEXTBYTE) {
/*
* TextByteAtom record
*/
outStream = new FilteredStringWriter();
for (long ii = i + 6; (ii <= i + 6 + atomSize)
&& (ii + 2 < pptdata.length); ii++) {
try {
// if(ii+2 >= pptdata.length) break; //FIXME
byte value = pptdata[(int) ii + 2];
outStream.write(value);
} catch (ArrayIndexOutOfBoundsException ex) {
if (LOG.isTraceEnabled()) { LOG.trace("size=" + pptdata.length); }
if (LOG.isErrorEnabled()) { LOG.error("extractSlides", ex); }
}
}
// Setting the identified text for Current Slide
if (currentSlide != null) {
currentSlide.addContent(outStream.toString());
}
} else if (atomType == PPTConstants.PPT_ATOM_TEXTCHAR) {
/*
* TextCharAtom record
*/
outStream = new FilteredStringWriter();
final String strTempContent = new String(pptdata, (int) i + 6,
(int) (atomSize) + 2);
final byte bytes[] = strTempContent.getBytes();
for (int ii = 0; ii < bytes.length - 1; ii += 2) {
outStream.write(Utils.getUnicodeCharacter(bytes, ii));
}
// Setting the identified text for Current Slide
if (currentSlide != null) {
currentSlide.addContent(outStream.toString());
}
} else if (atomType == PPTConstants.PPT_ATOM_SLIDEPERSISTANT) {
/*
* SlidePresistAtom Record
*/
if (sNum != 0) {
outStream = new FilteredStringWriter();
final long slideID = LittleEndian.getUInt(pptdata, (int) i + 20);
currentSlide = new Slide(slideID);
// currentSlide.addContent(outStream.toString());
slides.add(currentSlide);
}
sNum++;
} else if (atomType == PPTConstants.PPT_ATOM_DRAWINGGROUP) {
/*
* Diagram records are ignored
*/
if (LOG.isTraceEnabled()) { LOG.trace("Drawing Groups are ignored."); }
break;
} else {
// ignored
// if (LOG.isTraceEnabled()) {
// LOG.trace("Unhandled atomType: " + atomType);
// }
}
}
return slides;
}
}