Java Examples for org.apache.poi.hwpf.usermodel.CharacterRun

The following Java examples will help you understand the usage of org.apache.poi.hwpf.usermodel.CharacterRun. These source code samples are taken from different open source projects.

Example 1
Project: POI-Android-master  File: AbstractWordConverter.java View source code
/**
 * Walks the character runs of {@code range} and emits their content into
 * {@code block}.
 *
 * <p>The work is done in up to three phases:
 * <ol>
 * <li>For an {@link HWPFDocument}, collect the "structures" that start inside
 * the range: bookmarks not already on {@code bookmarkStack}, alive fields and
 * dead fields.</li>
 * <li>If any structure moved the cursor, process the structures in sorted
 * order, recursing into the plain text between and after them, and return
 * {@code true}.</li>
 * <li>Otherwise iterate the runs directly, dispatching pictures, special
 * characters and field marks to their handlers and writing the remaining text
 * through {@code outputCharacters}.</li>
 * </ol>
 *
 * <p>Marker characters are compared with {@code text.charAt(0)} rather than
 * with the first byte of {@code text.getBytes()}, so the result does not
 * depend on the platform default charset.
 *
 * @param wordDocument      document being converted
 * @param currentTableLevel table nesting level; {@code Integer.MIN_VALUE}
 *                          disables stripping of a trailing {@code BEL_MARK}
 * @param range             range to process; may be {@code null}
 * @param block             element that receives the output
 * @return {@code false} when {@code range} is {@code null}; {@code true} when
 *         structure processing advanced past the range start; otherwise
 *         whether any plain-text run contained non-whitespace text
 */
protected boolean processCharacters(final HWPFDocumentCore wordDocument, final int currentTableLevel, final Range range, final Element block) {
    if (range == null)
        return false;
    boolean haveAnyText = false;
    /*
     * In text there can be fields, bookmarks, may be other structures (code
     * below allows extension). Those structures can overlaps, so either we
     * should process char-by-char (slow) or find a correct way to
     * reconstruct the structure of range -- sergey
     */
    List<Structure> structures = new LinkedList<Structure>();
    if (wordDocument instanceof HWPFDocument) {
        final HWPFDocument doc = (HWPFDocument) wordDocument;
        Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks().getBookmarksStartedBetween(range.getStartOffset(), range.getEndOffset());
        if (rangeBookmarks != null) {
            for (List<Bookmark> lists : rangeBookmarks.values()) {
                for (Bookmark bookmark : lists) {
                    // Bookmarks on the stack are already being processed by a caller.
                    if (!bookmarkStack.contains(bookmark))
                        addToStructures(structures, new Structure(bookmark));
                }
            }
        }
        // TODO: dead fields?
        // NOTE(review): skipUntil is never reassigned in this method, so the
        // check below only fires for start offsets below -1.
        int skipUntil = -1;
        for (int c = 0; c < range.numCharacterRuns(); c++) {
            CharacterRun characterRun = range.getCharacterRun(c);
            if (characterRun == null)
                throw new AssertionError();
            if (characterRun.getStartOffset() < skipUntil)
                continue;
            String text = characterRun.text();
            // Only runs that open a field are of interest in this scan.
            if (text == null || text.length() == 0 || text.charAt(0) != FIELD_BEGIN_MARK)
                continue;
            Field aliveField = doc.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
            if (aliveField != null) {
                addToStructures(structures, new Structure(aliveField));
            } else {
                int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(wordDocument, range, c);
                if (separatorEnd != null) {
                    addToStructures(structures, new Structure(new DeadFieldBoundaries(c, separatorEnd[0], separatorEnd[1]), characterRun.getStartOffset(), range.getCharacterRun(separatorEnd[1]).getEndOffset()));
                    // Resume scanning after the run that closes the dead field.
                    c = separatorEnd[1];
                }
            }
        }
    }
    structures = new ArrayList<Structure>(structures);
    Collections.sort(structures);
    int previous = range.getStartOffset();
    for (Structure structure : structures) {
        if (structure.start != previous) {
            // Plain text between the previous structure and this one.
            Range subrange = new Range(previous, structure.start, range) {

                @Override
                public String toString() {
                    return "BetweenStructuresSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        if (structure.structure instanceof Bookmark) {
            // other bookmarks with same boundaries
            List<Bookmark> bookmarks = new LinkedList<Bookmark>();
            for (Bookmark bookmark : ((HWPFDocument) wordDocument).getBookmarks().getBookmarksStartedBetween(structure.start, structure.start + 1).values().iterator().next()) {
                if (bookmark.getStart() == structure.start && bookmark.getEnd() == structure.end) {
                    bookmarks.add(bookmark);
                }
            }
            // Push while processing so nested calls do not pick them up again.
            bookmarkStack.addAll(bookmarks);
            try {
                int end = Math.min(range.getEndOffset(), structure.end);
                Range subrange = new Range(structure.start, end, range) {

                    @Override
                    public String toString() {
                        return "BookmarksSubrange " + super.toString();
                    }
                };
                processBookmarks(wordDocument, block, subrange, currentTableLevel, bookmarks);
            } finally {
                bookmarkStack.removeAll(bookmarks);
            }
        } else if (structure.structure instanceof Field) {
            Field field = (Field) structure.structure;
            processField((HWPFDocument) wordDocument, range, currentTableLevel, field, block);
        } else if (structure.structure instanceof DeadFieldBoundaries) {
            DeadFieldBoundaries boundaries = (DeadFieldBoundaries) structure.structure;
            processDeadField(wordDocument, block, range, currentTableLevel, boundaries.beginMark, boundaries.separatorMark, boundaries.endMark);
        } else {
            throw new UnsupportedOperationException("NYI: " + structure.structure.getClass());
        }
        // Never move the cursor beyond the end of the range being processed.
        previous = Math.min(range.getEndOffset(), structure.end);
    }
    if (previous != range.getStartOffset()) {
        if (previous > range.getEndOffset()) {
            logger.log(POILogger.WARN, "Latest structure in ", range, " ended at #" + previous, " after range boundaries [", range.getStartOffset() + "; " + range.getEndOffset(), ")");
            return true;
        }
        if (previous < range.getEndOffset()) {
            // Plain text after the last structure.
            Range subrange = new Range(previous, range.getEndOffset(), range) {

                @Override
                public String toString() {
                    return "AfterStructureSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        return true;
    }
    for (int c = 0; c < range.numCharacterRuns(); c++) {
        CharacterRun characterRun = range.getCharacterRun(c);
        if (characterRun == null)
            throw new AssertionError();
        if (wordDocument instanceof HWPFDocument && ((HWPFDocument) wordDocument).getPicturesTable().hasPicture(characterRun)) {
            HWPFDocument newFormat = (HWPFDocument) wordDocument;
            Picture picture = newFormat.getPicturesTable().extractPicture(characterRun, true);
            processImage(block, characterRun.text().charAt(0) == 0x01, picture);
            continue;
        }
        String text = characterRun.text();
        // Char-based emptiness check; getBytes() would depend on the default charset.
        if (text.length() == 0)
            continue;
        final char firstChar = text.charAt(0);
        if (characterRun.isSpecialCharacter()) {
            if (firstChar == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processNoteAnchor(doc, characterRun, block);
                continue;
            }
            if (firstChar == SPECCHAR_DRAWN_OBJECT && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processDrawnObject(doc, characterRun, block);
                continue;
            }
            if (characterRun.isOle2() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processOle2(doc, characterRun, block);
                continue;
            }
        }
        if (firstChar == FIELD_BEGIN_MARK) {
            if (wordDocument instanceof HWPFDocument) {
                Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
                if (aliveField != null) {
                    processField(((HWPFDocument) wordDocument), range, currentTableLevel, aliveField, block);
                    int continueAfter = aliveField.getFieldEndOffset();
                    // Advance past every run that ends at or before the field end.
                    while (c < range.numCharacterRuns() && range.getCharacterRun(c).getEndOffset() <= continueAfter) {
                        c++;
                    }
                    // Step back one so the loop increment lands on the first
                    // run beyond the field; if we ran off the end, leave c so
                    // the loop terminates.
                    if (c < range.numCharacterRuns())
                        c--;
                    continue;
                }
            }
            // tryDeadField returns the run index to resume after; when it
            // equals c the assignment is a no-op and the begin mark is skipped.
            c = tryDeadField(wordDocument, range, currentTableLevel, c, block);
            continue;
        }
        if (firstChar == FIELD_SEPARATOR_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (firstChar == FIELD_END_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (characterRun.isSpecialCharacter() || characterRun.isObj() || characterRun.isOle2()) {
            continue;
        }
        // Drop a trailing "\r", or a trailing BEL_MARK unless the table level
        // is the Integer.MIN_VALUE sentinel.
        if (text.endsWith("\r") || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != Integer.MIN_VALUE))
            text = text.substring(0, text.length() - 1);
        {
            // line breaks
            StringBuilder stringBuilder = new StringBuilder();
            for (char charChar : text.toCharArray()) {
                if (charChar == 11) {
                    // Flush buffered text before emitting the line break.
                    if (stringBuilder.length() > 0) {
                        outputCharacters(block, characterRun, stringBuilder.toString());
                        stringBuilder.setLength(0);
                    }
                    processLineBreak(block, characterRun);
                } else if (charChar == 30) {
                    // Non-breaking hyphens are stored as ASCII 30
                    stringBuilder.append(UNICODECHAR_NONBREAKING_HYPHEN);
                } else if (charChar == 31) {
                    // Non-required hyphens to zero-width space
                    stringBuilder.append(UNICODECHAR_ZERO_WIDTH_SPACE);
                } else if (charChar >= 0x20 || charChar == 0x09 || charChar == 0x0A || charChar == 0x0D) {
                    // Other control characters below 0x20 are dropped.
                    stringBuilder.append(charChar);
                }
            }
            if (stringBuilder.length() > 0) {
                outputCharacters(block, characterRun, stringBuilder.toString());
                stringBuilder.setLength(0);
            }
        }
        haveAnyText |= text.trim().length() != 0;
    }
    return haveAnyText;
}
Example 2
Project: poi-master  File: AbstractWordConverter.java View source code
/**
 * Walks the character runs of {@code range} and emits their content into
 * {@code block}.
 *
 * <p>For an {@link HWPFDocument} the method first collects the structures
 * (bookmarks not already on {@code bookmarkStack}, alive fields, dead fields)
 * that start inside the range. If processing them advances past the range
 * start, the plain text between and after them is handled by recursive calls
 * and the method returns {@code true}. Otherwise the runs are iterated
 * directly.
 *
 * @param wordDocument      document being converted
 * @param currentTableLevel table nesting level; {@code Integer.MIN_VALUE}
 *                          disables stripping of a trailing {@code BEL_MARK}
 * @param range             range to process; may be {@code null}
 * @param block             element that receives the output
 * @return {@code false} when {@code range} is {@code null}; {@code true} when
 *         structure processing advanced past the range start; otherwise
 *         whether any plain-text run contained non-whitespace text
 */
protected boolean processCharacters(final HWPFDocumentCore wordDocument, final int currentTableLevel, final Range range, final Element block) {
    if (range == null)
        return false;
    boolean haveAnyText = false;
    /*
         * In text there can be fields, bookmarks, may be other structures (code
         * below allows extension). Those structures can overlaps, so either we
         * should process char-by-char (slow) or find a correct way to
         * reconstruct the structure of range -- sergey
         */
    List<Structure> structures = new LinkedList<Structure>();
    if (wordDocument instanceof HWPFDocument) {
        final HWPFDocument doc = (HWPFDocument) wordDocument;
        Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks().getBookmarksStartedBetween(range.getStartOffset(), range.getEndOffset());
        if (rangeBookmarks != null) {
            for (List<Bookmark> lists : rangeBookmarks.values()) {
                for (Bookmark bookmark : lists) {
                    // Bookmarks on the stack are already being processed by a caller.
                    if (!bookmarkStack.contains(bookmark))
                        addToStructures(structures, new Structure(bookmark));
                }
            }
        }
        // TODO: dead fields?
        // NOTE(review): skipUntil is never reassigned in this method, so the
        // check below only fires for start offsets below -1.
        int skipUntil = -1;
        for (int c = 0; c < range.numCharacterRuns(); c++) {
            CharacterRun characterRun = range.getCharacterRun(c);
            if (characterRun == null)
                throw new AssertionError();
            if (characterRun.getStartOffset() < skipUntil)
                continue;
            String text = characterRun.text();
            // Only runs that open a field are of interest in this scan.
            if (text == null || text.length() == 0 || text.charAt(0) != FIELD_BEGIN_MARK)
                continue;
            Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
            if (aliveField != null) {
                addToStructures(structures, new Structure(aliveField));
            } else {
                int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(wordDocument, range, c);
                if (separatorEnd != null) {
                    addToStructures(structures, new Structure(new DeadFieldBoundaries(c, separatorEnd[0], separatorEnd[1]), characterRun.getStartOffset(), range.getCharacterRun(separatorEnd[1]).getEndOffset()));
                    // Resume scanning after the run that closes the dead field.
                    c = separatorEnd[1];
                }
            }
        }
    }
    structures = new ArrayList<Structure>(structures);
    Collections.sort(structures);
    // Offset up to which the range has been handled so far.
    int previous = range.getStartOffset();
    for (Structure structure : structures) {
        if (structure.start != previous) {
            // Plain text between the previous structure and this one.
            Range subrange = new Range(previous, structure.start, range) {

                @Override
                public String toString() {
                    return "BetweenStructuresSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        if (structure.structure instanceof Bookmark) {
            // other bookmarks with same boundaries
            List<Bookmark> bookmarks = new LinkedList<Bookmark>();
            for (Bookmark bookmark : ((HWPFDocument) wordDocument).getBookmarks().getBookmarksStartedBetween(structure.start, structure.start + 1).values().iterator().next()) {
                if (bookmark.getStart() == structure.start && bookmark.getEnd() == structure.end) {
                    bookmarks.add(bookmark);
                }
            }
            // Push while processing so nested calls do not pick them up again.
            bookmarkStack.addAll(bookmarks);
            try {
                int end = Math.min(range.getEndOffset(), structure.end);
                Range subrange = new Range(structure.start, end, range) {

                    @Override
                    public String toString() {
                        return "BookmarksSubrange " + super.toString();
                    }
                };
                processBookmarks(wordDocument, block, subrange, currentTableLevel, bookmarks);
            } finally {
                bookmarkStack.removeAll(bookmarks);
            }
        } else if (structure.structure instanceof Field) {
            Field field = (Field) structure.structure;
            processField((HWPFDocument) wordDocument, range, currentTableLevel, field, block);
        } else if (structure.structure instanceof DeadFieldBoundaries) {
            DeadFieldBoundaries boundaries = (DeadFieldBoundaries) structure.structure;
            processDeadField(wordDocument, block, range, currentTableLevel, boundaries.beginMark, boundaries.separatorMark, boundaries.endMark);
        } else {
            throw new UnsupportedOperationException("NYI: " + structure.structure.getClass());
        }
        // Never move the cursor beyond the end of the range being processed.
        previous = Math.min(range.getEndOffset(), structure.end);
    }
    if (previous != range.getStartOffset()) {
        if (previous > range.getEndOffset()) {
            logger.log(POILogger.WARN, "Latest structure in ", range, " ended at #" + previous, " after range boundaries [", range.getStartOffset() + "; " + range.getEndOffset(), ")");
            return true;
        }
        if (previous < range.getEndOffset()) {
            // Plain text after the last structure.
            Range subrange = new Range(previous, range.getEndOffset(), range) {

                @Override
                public String toString() {
                    return "AfterStructureSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        return true;
    }
    // The cursor did not move: handle the runs one by one.
    for (int c = 0; c < range.numCharacterRuns(); c++) {
        CharacterRun characterRun = range.getCharacterRun(c);
        if (characterRun == null)
            throw new AssertionError();
        if (wordDocument instanceof HWPFDocument && ((HWPFDocument) wordDocument).getPicturesTable().hasPicture(characterRun)) {
            HWPFDocument newFormat = (HWPFDocument) wordDocument;
            Picture picture = newFormat.getPicturesTable().extractPicture(characterRun, true);
            processImage(block, characterRun.text().charAt(0) == 0x01, picture);
            continue;
        }
        String text = characterRun.text();
        if (text.isEmpty())
            continue;
        if (characterRun.isSpecialCharacter()) {
            if (text.charAt(0) == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processNoteAnchor(doc, characterRun, block);
                continue;
            }
            if (text.charAt(0) == SPECCHAR_DRAWN_OBJECT && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processDrawnObject(doc, characterRun, block);
                continue;
            }
            if (characterRun.isOle2() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processOle2(doc, characterRun, block);
                continue;
            }
            if (characterRun.isSymbol() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processSymbol(doc, characterRun, block);
                continue;
            }
        }
        if (text.charAt(0) == FIELD_BEGIN_MARK) {
            if (wordDocument instanceof HWPFDocument) {
                Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
                if (aliveField != null) {
                    processField(((HWPFDocument) wordDocument), range, currentTableLevel, aliveField, block);
                    int continueAfter = aliveField.getFieldEndOffset();
                    // Advance past every run that ends at or before the field end.
                    while (c < range.numCharacterRuns() && range.getCharacterRun(c).getEndOffset() <= continueAfter) c++;
                    // Step back one so the loop increment lands on the first
                    // run beyond the field; if we ran off the end, leave c so
                    // the loop terminates.
                    if (c < range.numCharacterRuns())
                        c--;
                    continue;
                }
            }
            // Resume after whatever run index tryDeadField reports; when it
            // equals c, only the begin mark itself is skipped.
            int skipTo = tryDeadField(wordDocument, range, currentTableLevel, c, block);
            if (skipTo != c) {
                c = skipTo;
                continue;
            }
            continue;
        }
        if (text.charAt(0) == FIELD_SEPARATOR_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (text.charAt(0) == FIELD_END_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (characterRun.isSpecialCharacter() || characterRun.isObj() || characterRun.isOle2()) {
            continue;
        }
        // Drop a trailing "\r", or a trailing BEL_MARK unless the table level
        // is the Integer.MIN_VALUE sentinel.
        if (text.endsWith("\r") || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != Integer.MIN_VALUE))
            text = text.substring(0, text.length() - 1);
        {
            // line breaks
            StringBuilder stringBuilder = new StringBuilder();
            for (char charChar : text.toCharArray()) {
                if (charChar == 11) {
                    // Flush buffered text before emitting the line break.
                    if (stringBuilder.length() > 0) {
                        outputCharacters(block, characterRun, stringBuilder.toString());
                        stringBuilder.setLength(0);
                    }
                    processLineBreak(block, characterRun);
                } else if (charChar == 30) {
                    // Non-breaking hyphens are stored as ASCII 30
                    stringBuilder.append(UNICODECHAR_NONBREAKING_HYPHEN);
                } else if (charChar == 31) {
                    // Non-required hyphens to zero-width space
                    stringBuilder.append(UNICODECHAR_ZERO_WIDTH_SPACE);
                } else if (charChar >= 0x20 || charChar == 0x09 || charChar == 0x0A || charChar == 0x0D) {
                    // Other control characters below 0x20 are dropped.
                    stringBuilder.append(charChar);
                }
            }
            if (stringBuilder.length() > 0) {
                outputCharacters(block, characterRun, stringBuilder.toString());
                stringBuilder.setLength(0);
            }
        }
        // Uses the text after trailing-mark stripping but before character mapping.
        haveAnyText |= text.trim().length() != 0;
    }
    return haveAnyText;
}
Example 3
Project: SimpleAndroidDocView-master  File: AbstractWordConverter.java View source code
/**
 * Walks the character runs of {@code range} and emits their content into
 * {@code block}.
 *
 * <p>The work is done in up to three phases:
 * <ol>
 * <li>For an {@link HWPFDocument}, collect the "structures" that start inside
 * the range: bookmarks not already on {@code bookmarkStack}, alive fields and
 * dead fields.</li>
 * <li>If any structure moved the cursor, process the structures in sorted
 * order, recursing into the plain text between and after them, and return
 * {@code true}.</li>
 * <li>Otherwise iterate the runs directly, dispatching pictures, special
 * characters, symbols and field marks to their handlers and writing the
 * remaining text through {@code outputCharacters}.</li>
 * </ol>
 *
 * <p>Marker characters are compared with {@code text.charAt(0)} rather than
 * with the first byte of {@code text.getBytes()}, so the result does not
 * depend on the platform default charset.
 *
 * @param wordDocument      document being converted
 * @param currentTableLevel table nesting level; {@code Integer.MIN_VALUE}
 *                          disables stripping of a trailing {@code BEL_MARK}
 * @param range             range to process; may be {@code null}
 * @param block             element that receives the output
 * @return {@code false} when {@code range} is {@code null}; {@code true} when
 *         structure processing advanced past the range start; otherwise
 *         whether any plain-text run contained non-whitespace text
 */
protected boolean processCharacters(final HWPFDocumentCore wordDocument, final int currentTableLevel, final Range range, final Element block) {
    if (range == null)
        return false;
    boolean haveAnyText = false;
    /*
     * In text there can be fields, bookmarks, may be other structures (code
     * below allows extension). Those structures can overlaps, so either we
     * should process char-by-char (slow) or find a correct way to
     * reconstruct the structure of range -- sergey
     */
    List<Structure> structures = new LinkedList<Structure>();
    if (wordDocument instanceof HWPFDocument) {
        final HWPFDocument doc = (HWPFDocument) wordDocument;
        Map<Integer, List<Bookmark>> rangeBookmarks = doc.getBookmarks().getBookmarksStartedBetween(range.getStartOffset(), range.getEndOffset());
        if (rangeBookmarks != null) {
            for (List<Bookmark> lists : rangeBookmarks.values()) {
                for (Bookmark bookmark : lists) {
                    // Bookmarks on the stack are already being processed by a caller.
                    if (!bookmarkStack.contains(bookmark))
                        addToStructures(structures, new Structure(bookmark));
                }
            }
        }
        // TODO: dead fields?
        // NOTE(review): skipUntil is never reassigned in this method, so the
        // check below only fires for start offsets below -1.
        int skipUntil = -1;
        for (int c = 0; c < range.numCharacterRuns(); c++) {
            CharacterRun characterRun = range.getCharacterRun(c);
            if (characterRun == null)
                throw new AssertionError();
            if (characterRun.getStartOffset() < skipUntil)
                continue;
            String text = characterRun.text();
            // Only runs that open a field are of interest in this scan.
            if (text == null || text.length() == 0 || text.charAt(0) != FIELD_BEGIN_MARK)
                continue;
            Field aliveField = doc.getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
            if (aliveField != null) {
                addToStructures(structures, new Structure(aliveField));
            } else {
                int[] separatorEnd = tryDeadField_lookupFieldSeparatorEnd(wordDocument, range, c);
                if (separatorEnd != null) {
                    addToStructures(structures, new Structure(new DeadFieldBoundaries(c, separatorEnd[0], separatorEnd[1]), characterRun.getStartOffset(), range.getCharacterRun(separatorEnd[1]).getEndOffset()));
                    // Resume scanning after the run that closes the dead field.
                    c = separatorEnd[1];
                }
            }
        }
    }
    structures = new ArrayList<Structure>(structures);
    Collections.sort(structures);
    int previous = range.getStartOffset();
    for (Structure structure : structures) {
        if (structure.start != previous) {
            // Plain text between the previous structure and this one.
            Range subrange = new Range(previous, structure.start, range) {

                @Override
                public String toString() {
                    return "BetweenStructuresSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        if (structure.structure instanceof Bookmark) {
            // other bookmarks with same boundaries
            List<Bookmark> bookmarks = new LinkedList<Bookmark>();
            for (Bookmark bookmark : ((HWPFDocument) wordDocument).getBookmarks().getBookmarksStartedBetween(structure.start, structure.start + 1).values().iterator().next()) {
                if (bookmark.getStart() == structure.start && bookmark.getEnd() == structure.end) {
                    bookmarks.add(bookmark);
                }
            }
            // Push while processing so nested calls do not pick them up again.
            bookmarkStack.addAll(bookmarks);
            try {
                int end = Math.min(range.getEndOffset(), structure.end);
                Range subrange = new Range(structure.start, end, range) {

                    @Override
                    public String toString() {
                        return "BookmarksSubrange " + super.toString();
                    }
                };
                processBookmarks(wordDocument, block, subrange, currentTableLevel, bookmarks);
            } finally {
                bookmarkStack.removeAll(bookmarks);
            }
        } else if (structure.structure instanceof Field) {
            Field field = (Field) structure.structure;
            processField((HWPFDocument) wordDocument, range, currentTableLevel, field, block);
        } else if (structure.structure instanceof DeadFieldBoundaries) {
            DeadFieldBoundaries boundaries = (DeadFieldBoundaries) structure.structure;
            processDeadField(wordDocument, block, range, currentTableLevel, boundaries.beginMark, boundaries.separatorMark, boundaries.endMark);
        } else {
            throw new UnsupportedOperationException("NYI: " + structure.structure.getClass());
        }
        // Never move the cursor beyond the end of the range being processed.
        previous = Math.min(range.getEndOffset(), structure.end);
    }
    if (previous != range.getStartOffset()) {
        if (previous > range.getEndOffset()) {
            logger.log(POILogger.WARN, "Latest structure in ", range, " ended at #" + previous, " after range boundaries [", range.getStartOffset() + "; " + range.getEndOffset(), ")");
            return true;
        }
        if (previous < range.getEndOffset()) {
            // Plain text after the last structure.
            Range subrange = new Range(previous, range.getEndOffset(), range) {

                @Override
                public String toString() {
                    return "AfterStructureSubrange " + super.toString();
                }
            };
            processCharacters(wordDocument, currentTableLevel, subrange, block);
        }
        return true;
    }
    for (int c = 0; c < range.numCharacterRuns(); c++) {
        CharacterRun characterRun = range.getCharacterRun(c);
        if (characterRun == null)
            throw new AssertionError();
        if (wordDocument instanceof HWPFDocument && ((HWPFDocument) wordDocument).getPicturesTable().hasPicture(characterRun)) {
            HWPFDocument newFormat = (HWPFDocument) wordDocument;
            Picture picture = newFormat.getPicturesTable().extractPicture(characterRun, true);
            processImage(block, characterRun.text().charAt(0) == 0x01, picture);
            continue;
        }
        String text = characterRun.text();
        // Char-based emptiness check; getBytes() would depend on the default charset.
        if (text.length() == 0)
            continue;
        final char firstChar = text.charAt(0);
        if (characterRun.isSpecialCharacter()) {
            if (firstChar == SPECCHAR_AUTONUMBERED_FOOTNOTE_REFERENCE && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processNoteAnchor(doc, characterRun, block);
                continue;
            }
            if (firstChar == SPECCHAR_DRAWN_OBJECT && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processDrawnObject(doc, characterRun, block);
                continue;
            }
            if (characterRun.isOle2() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processOle2(doc, characterRun, block);
                continue;
            }
            if (characterRun.isSymbol() && (wordDocument instanceof HWPFDocument)) {
                HWPFDocument doc = (HWPFDocument) wordDocument;
                processSymbol(doc, characterRun, block);
                continue;
            }
        }
        if (firstChar == FIELD_BEGIN_MARK) {
            if (wordDocument instanceof HWPFDocument) {
                Field aliveField = ((HWPFDocument) wordDocument).getFields().getFieldByStartOffset(FieldsDocumentPart.MAIN, characterRun.getStartOffset());
                if (aliveField != null) {
                    processField(((HWPFDocument) wordDocument), range, currentTableLevel, aliveField, block);
                    int continueAfter = aliveField.getFieldEndOffset();
                    // Advance past every run that ends at or before the field end.
                    while (c < range.numCharacterRuns() && range.getCharacterRun(c).getEndOffset() <= continueAfter) {
                        c++;
                    }
                    // Step back one so the loop increment lands on the first
                    // run beyond the field; if we ran off the end, leave c so
                    // the loop terminates.
                    if (c < range.numCharacterRuns())
                        c--;
                    continue;
                }
            }
            // tryDeadField returns the run index to resume after; when it
            // equals c the assignment is a no-op and the begin mark is skipped.
            c = tryDeadField(wordDocument, range, currentTableLevel, c, block);
            continue;
        }
        if (firstChar == FIELD_SEPARATOR_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (firstChar == FIELD_END_MARK) {
            // shall not appear without FIELD_BEGIN_MARK
            continue;
        }
        if (characterRun.isSpecialCharacter() || characterRun.isObj() || characterRun.isOle2()) {
            continue;
        }
        // Drop a trailing "\r", or a trailing BEL_MARK unless the table level
        // is the Integer.MIN_VALUE sentinel.
        if (text.endsWith("\r") || (text.charAt(text.length() - 1) == BEL_MARK && currentTableLevel != Integer.MIN_VALUE))
            text = text.substring(0, text.length() - 1);
        {
            // line breaks
            StringBuilder stringBuilder = new StringBuilder();
            for (char charChar : text.toCharArray()) {
                if (charChar == 11) {
                    // Flush buffered text before emitting the line break.
                    if (stringBuilder.length() > 0) {
                        outputCharacters(block, characterRun, stringBuilder.toString());
                        stringBuilder.setLength(0);
                    }
                    processLineBreak(block, characterRun);
                } else if (charChar == 30) {
                    // Non-breaking hyphens are stored as ASCII 30
                    stringBuilder.append(UNICODECHAR_NONBREAKING_HYPHEN);
                } else if (charChar == 31) {
                    // Non-required hyphens to zero-width space
                    stringBuilder.append(UNICODECHAR_ZERO_WIDTH_SPACE);
                } else if (charChar >= 0x20 || charChar == 0x09 || charChar == 0x0A || charChar == 0x0D) {
                    // Other control characters below 0x20 are dropped.
                    stringBuilder.append(charChar);
                }
            }
            if (stringBuilder.length() > 0) {
                outputCharacters(block, characterRun, stringBuilder.toString());
                stringBuilder.setLength(0);
            }
        }
        haveAnyText |= text.trim().length() != 0;
    }
    return haveAnyText;
}
Example 4
Project: step-master  File: ParseVersionFile.java View source code
/**
 * Walks every character run of a hard-coded Word document and prints a
 * tagged, line-oriented dump to standard output.
 *
 * <p>Runs are split into "vanished" runs (per {@code cr.isVanished()}) and
 * all other runs. When a vanished section begins, the last line of the text
 * accumulated so far is printed as {@code @Reference}/{@code @FullText},
 * together with the underlined text collected in {@code partialLine} as
 * {@code @MatchingText}. Inside a vanished section, bold runs trigger the
 * {@code @OptionsType} output, the first non-bold run after that triggers
 * {@code @OptionsAlternative}, and a carriage return (as located by
 * {@code hasCarriageReturn}) closes the entry with an optional
 * {@code @OptionsQualifier} line and advances the option counter.
 *
 * @param args ignored; the input path is hard-coded
 * @throws Exception if the document cannot be opened or parsed
 */
public static void main(final String[] args) throws Exception {
    final POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream("C:\\Users\\Chris\\Downloads\\Gen 1-10.prepared for reviewer.doc"));
    final HWPFDocument doc = new HWPFDocument(fs);
    final Range range = doc.getRange();
    boolean isHidden = false;
    boolean prefix = false;
    boolean mainText = false;
    String currentRef = null;
    StringBuilder text = new StringBuilder(256);
    StringBuilder partialLine = new StringBuilder(256);
    int count = 0;
    for (int k = 0; k < range.numParagraphs(); k++) {
        final org.apache.poi.hwpf.usermodel.Paragraph paragraph = range.getParagraph(k);
        for (int j = 0; j < paragraph.numCharacterRuns(); j++) {
            final org.apache.poi.hwpf.usermodel.CharacterRun cr = paragraph.getCharacterRun(j);
            String docText = cr.text();
            if (cr.isVanished()) {
                if (!isHidden) {
                    // we only print out the last line of full text and of partial line...
                    final String[] lines = LINES.split(text.toString());
                    // split() drops trailing empty strings, so text made only of
                    // separators yields a zero-length array
                    String lastLine = lines.length == 0 ? "" : lines[lines.length - 1];
                    final Matcher matcher = REF.matcher(lastLine);
                    if (matcher.find()) {
                        currentRef = matcher.group();
                        // literal removal: the matched reference is plain text, not a regex
                        lastLine = lastLine.replace(currentRef, "").trim();
                    }
                    System.out.println("===============================");
                    System.out.println("@Reference=\t" + currentRef);
                    System.out.println("@FullText=\t" + lastLine);
                    System.out.println("@MatchingText=\t" + partialLine.toString());
                    count = 0;
                    text = new StringBuilder(256);
                    partialLine = new StringBuilder(128);
                    isHidden = true;
                }
                if (cr.isBold()) {
                    // if we're looking at bold text, we need to output the prefix
                    if (!prefix) {
                        System.out.println(String.format("@OptionsType%d=\t%s", count, clean(text.toString())));
                        prefix = true;
                        text = new StringBuilder(256);
                    }
                } else {
                    if (!mainText && prefix) {
                        // no longer bold, but already have a prefix
                        mainText = true;
                        System.out.println(String.format("@OptionsAlternative%d=\t%s", count, clean(text.toString())));
                        text = new StringBuilder(256);
                    }
                    // Every non-bold vanished run gets the same carriage-return
                    // handling: whatever precedes the break is the qualifier of the
                    // current entry, the remainder starts the next entry.
                    final int splitChar = hasCarriageReturn(docText);
                    if (splitChar != -1) {
                        // we've split to a new line
                        final String postfix = docText.substring(0, splitChar);
                        text.append(postfix);
                        if (isNotBlank(postfix)) {
                            final String clean = clean(text.toString());
                            if (isNotBlank(clean)) {
                                System.out.println(String.format("@OptionsQualifier%d=\t%s", count, clean));
                            }
                        }
                        prefix = false;
                        mainText = false;
                        count++;
                        text = new StringBuilder(256);
                        docText = docText.substring(splitChar);
                    }
                }
                text.append(docText);
            } else {
                if (isHidden) {
                    // leaving a vanished section: reset the per-entry state
                    text = new StringBuilder(256);
                    prefix = false;
                    mainText = false;
                    isHidden = false;
                }
                if (cr.getUnderlineCode() != 0) {
                    // underlined visible text is what gets reported as @MatchingText
                    partialLine.append(docText);
                }
                text.append(docText);
            }
        }
    }
}
Example 5
Project: OCRaptor-master  File: WordExtractor.java View source code
/**
 * Emits XHTML for a single paragraph, or for the whole table the paragraph
 * starts when called at the top level.
 *
 * <p>For a table, every cell paragraph is handled recursively inside
 * {@code table/tbody/tr/td} elements. Otherwise the paragraph is wrapped in
 * the tag chosen from its style, each character run is dispatched (field
 * markers, floating pictures, inline pictures, plain text), and any
 * bold/italic/strike-through elements still open are closed.
 *
 * @param p                the paragraph to output
 * @param parentTableLevel table level of the caller; tables are only expanded when this is 0
 * @param r                the range containing {@code p}, used to look up its table and field runs
 * @param document         the document, used for the style sheet and field table
 * @param docPart          the document part whose fields are searched
 * @param pictures         source of the pictures to attach
 * @param pictureTable     used to test whether a run carries an inline picture
 * @param xhtml            the handler receiving the output
 * @return {@code t.numParagraphs() - 1} when a table was written (presumably
 *         the number of further paragraphs the caller should skip - verify
 *         against the caller), otherwise 0
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Nested tables are not recursed into, so a table is only expanded
    // when we are not already inside one (parentTableLevel == 0)
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");
                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        return (t.numParagraphs() - 1);
    }
    // Pick the wrapping tag from the paragraph style, defaulting to <p>
    TagAndStyle tas;
    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        tas = new TagAndStyle("p", null);
    }
    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }
    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();
        // FIELD_BEGIN_MARK (0x13). Compare the char directly: this avoids the
        // platform charset and does not fail on an empty run.
        if (!runText.isEmpty() && runText.charAt(0) == 0x13) {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_unknown_id";
                // the separator run may be missing, so guard against null
                CharacterRun separatorRun = field.getMarkSeparatorCharacterRun(r);
                if (separatorRun != null) {
                    id = "_" + separatorRun.getPicOffset();
                }
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }
        // The control characters are written as escapes so they cannot be
        // lost; an empty literal here would make startsWith() always true
        // and the two branches below unreachable.
        if (runText.equals("\u0013")) {
            // A run that is only the field-begin mark: hand over to the
            // special-run handler, which reports how many runs it consumed
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s)
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();
                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }
    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }
    xhtml.endElement(tas.getTag());
    return 0;
}
Example 6
Project: tika-master  File: WordExtractor.java View source code
/**
 * Emits XHTML for a single paragraph, or for the whole table the paragraph
 * starts when called at the top level.
 *
 * <p>For a table, every cell paragraph is handled recursively inside
 * {@code table/tbody/tr/td} elements. Paragraphs containing only whitespace
 * are skipped. Otherwise the paragraph is wrapped in the tag chosen from its
 * style, prefixed with its list numbering when it is a list item, each
 * character run is dispatched (field markers, floating pictures, inline
 * pictures, plain text), and any bold/italic/strike-through elements still
 * open are closed.
 *
 * @param p                the paragraph to output
 * @param parentTableLevel table level of the caller; tables are only expanded when this is 0
 * @param r                the range containing {@code p}, used to look up its table and field runs
 * @param document         the document, used for the style sheet and field table
 * @param docPart          the document part whose fields are searched
 * @param pictures         source of the pictures to attach
 * @param pictureTable     used to test whether a run carries an inline picture
 * @param listManager      supplies the formatted number for list paragraphs
 * @param xhtml            the handler receiving the output
 * @return {@code t.numParagraphs() - 1} when a table was written (presumably
 *         the number of further paragraphs the caller should skip - verify
 *         against the caller), otherwise 0
 */
private int handleParagraph(Paragraph p, int parentTableLevel, Range r, HWPFDocument document, FieldsDocumentPart docPart, PicturesSource pictures, PicturesTable pictureTable, ListManager listManager, XHTMLContentHandler xhtml) throws SAXException, IOException, TikaException {
    // Nested tables are not recursed into, so a table is only expanded
    // when we are not already inside one (parentTableLevel == 0)
    if (p.isInTable() && p.getTableLevel() > parentTableLevel && parentTableLevel == 0) {
        Table t = r.getTable(p);
        xhtml.startElement("table");
        xhtml.startElement("tbody");
        for (int rn = 0; rn < t.numRows(); rn++) {
            TableRow row = t.getRow(rn);
            xhtml.startElement("tr");
            for (int cn = 0; cn < row.numCells(); cn++) {
                TableCell cell = row.getCell(cn);
                xhtml.startElement("td");
                for (int pn = 0; pn < cell.numParagraphs(); pn++) {
                    Paragraph cellP = cell.getParagraph(pn);
                    handleParagraph(cellP, p.getTableLevel(), cell, document, docPart, pictures, pictureTable, listManager, xhtml);
                }
                xhtml.endElement("td");
            }
            xhtml.endElement("tr");
        }
        xhtml.endElement("tbody");
        xhtml.endElement("table");
        return (t.numParagraphs() - 1);
    }
    String text = p.text();
    if (text.replaceAll("[\\r\\n\\s]+", "").isEmpty()) {
        // Skip empty paragraphs
        return 0;
    }
    // Pick the wrapping tag from the paragraph style, defaulting to <p>
    TagAndStyle tas;
    String numbering = null;
    if (document.getStyleSheet().numStyles() > p.getStyleIndex()) {
        StyleDescription style = document.getStyleSheet().getStyleDescription(p.getStyleIndex());
        if (style != null && style.getName() != null && style.getName().length() > 0) {
            if (p.isInList()) {
                numbering = listManager.getFormattedNumber(p);
            }
            tas = buildParagraphTagAndStyle(style.getName(), (parentTableLevel > 0));
        } else {
            tas = new TagAndStyle("p", null);
        }
    } else {
        tas = new TagAndStyle("p", null);
    }
    if (tas.getStyleClass() != null) {
        xhtml.startElement(tas.getTag(), "class", tas.getStyleClass());
    } else {
        xhtml.startElement(tas.getTag());
    }
    if (numbering != null) {
        xhtml.characters(numbering);
    }
    for (int j = 0; j < p.numCharacterRuns(); j++) {
        CharacterRun cr = p.getCharacterRun(j);
        String runText = cr.text();
        // FIELD_BEGIN_MARK (0x13). Comparing the char is equivalent to testing
        // the first UTF-8 byte, but does not fail on an empty run.
        if (!runText.isEmpty() && runText.charAt(0) == 0x13) {
            Field field = document.getFields().getFieldByStartOffset(docPart, cr.getStartOffset());
            // 56 is a document link
            if (field != null && (field.getType() == 58 || field.getType() == 56)) {
                // Embedded Object: add a <div
                // class="embedded" id="_X"/> so consumer can see where
                // in the main text each embedded document
                // occurred:
                String id = "_unknown_id";
                //this can return null (TIKA-1956)
                CharacterRun mscr = field.getMarkSeparatorCharacterRun(r);
                if (mscr != null) {
                    id = "_" + mscr.getPicOffset();
                }
                AttributesImpl attributes = new AttributesImpl();
                attributes.addAttribute("", "class", "class", "CDATA", "embedded");
                attributes.addAttribute("", "id", "id", "CDATA", id);
                xhtml.startElement("div", attributes);
                xhtml.endElement("div");
            }
        }
        // The control characters are written as escapes so they cannot be
        // lost; an empty literal here would make startsWith() always true
        // and the two branches below unreachable.
        if (runText.equals("\u0013")) {
            // A run that is only the field-begin mark: hand over to the
            // special-run handler, which reports how many runs it consumed
            j += handleSpecialCharacterRuns(p, j, tas.isHeading(), pictures, xhtml);
        } else if (runText.startsWith("\u0008")) {
            // Floating Picture(s)
            for (int pn = 0; pn < runText.length(); pn++) {
                // Assume they're in the order from the unclaimed list...
                Picture picture = pictures.nextUnclaimed();
                // Output
                handlePictureCharacterRun(cr, picture, pictures, xhtml);
            }
        } else if (pictureTable.hasPicture(cr)) {
            // Inline Picture
            Picture picture = pictures.getFor(cr);
            handlePictureCharacterRun(cr, picture, pictures, xhtml);
        } else {
            handleCharacterRun(cr, tas.isHeading(), xhtml);
        }
    }
    // Close any still open style tags
    if (curStrikeThrough) {
        xhtml.endElement("s");
        curStrikeThrough = false;
    }
    if (curItalic) {
        xhtml.endElement("i");
        curItalic = false;
    }
    if (curBold) {
        xhtml.endElement("b");
        curBold = false;
    }
    xhtml.endElement(tas.getTag());
    return 0;
}