/* Copyright 2004 Ryan Ackley * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package org.apache.nutch.parse.msword; import org.apache.nutch.parse.msword.chp.*; import org.apache.poi.util.LittleEndian; import org.apache.poi.hwpf.model.*; import java.util.*; /** * This class is used to extract text from Word 6 documents only. It should * only be called from the org.textmining.text.extraction.WordExtractor because * it will automatically determine the version. * * @author Ryan Ackley */ class Word6Extractor { public Word6Extractor() { } /** * Extracts the text * * @param mainStream The POIFS document stream entitled "WordDocument". * * @return The text from the document * @throws Exception If there are any unexpected exceptions. */ public String extractText(byte[] mainStream) throws Exception { int fcMin = LittleEndian.getInt(mainStream, 0x18); int fcMax = LittleEndian.getInt(mainStream, 0x1C); int chpTableOffset = LittleEndian.getInt(mainStream, 0xb8); int chpTableSize = LittleEndian.getInt(mainStream, 0xbc); // get a list of character properties Word6CHPBinTable chpTable = new Word6CHPBinTable(mainStream, chpTableOffset, chpTableSize, fcMin, new TextPieceTable()); List textRuns = chpTable.getTextRuns(); // iterate through the WordTextBuffer finalTextBuf = new WordTextBuffer(); Iterator runsIt = textRuns.iterator(); while(runsIt.hasNext()) { CHPX chpx = (CHPX)runsIt.next(); int runStart = chpx.getStart() + fcMin; int runEnd = chpx.getEnd() + fcMin; if (!isDeleted(chpx.getGrpprl())) { String s = new String(mainStream, runStart, Math.min(runEnd, fcMax) - runStart, "Cp1252"); finalTextBuf.append(s); if (runEnd >= fcMax) { break; } } } return finalTextBuf.toString(); } /** * Used to determine if a run of text has been deleted. * @param grpprl The list of sprms for this run of text. * @return */ private boolean isDeleted(byte[] grpprl) { int offset = 0; boolean deleted = false; while (offset < grpprl.length) { switch (LittleEndian.getUnsignedByte(grpprl, offset++)) { case 65: deleted = grpprl[offset++] != 0; break; case 66: offset++; break; case 67: offset++; break; case 68: offset += grpprl[offset]; break; case 69: offset += 2; break; case 70: offset += 4; break; case 71: offset++; break; case 72: offset += 2; break; case 73: offset += 3; break; case 74: offset += grpprl[offset]; break; case 75: offset++; break; case 80: offset += 2; break; case 81: offset += grpprl[offset]; break; case 82: offset += grpprl[offset]; break; case 83: break; case 85: offset++; break; case 86: offset++; break; case 87: offset++; break; case 88: offset++; break; case 89: offset++; break; case 90: offset++; break; case 91: offset++; break; case 92: offset++; break; case 93: offset += 2; break; case 94: offset++; break; case 95: offset += 3; break; case 96: offset += 2; break; case 97: offset += 2; break; case 98: offset++; break; case 99: offset++; break; case 100: offset++; break; case 101: offset++; break; case 102: offset++; break; case 103: offset += grpprl[offset]; break; case 104: offset++; break; case 105: offset += grpprl[offset]; break; case 106: offset += grpprl[offset]; break; case 107: offset += 2; break; case 108: offset += grpprl[offset]; break; case 109: offset += 2; break; case 110: offset += 2; break; case 117: offset++; break; case 118: offset++; break; } } return deleted; } }