package com.zilbo.flamingSailor.TE.model;
import com.zilbo.flamingSailor.TE.model.TextType.TextType;
import com.zilbo.flamingSailor.TE.model.TextType.Unknown;
import org.apache.log4j.Logger;
import java.awt.geom.Rectangle2D;
import java.util.ArrayList;
import java.util.List;
/*
* Copyright 2012 Zilbo.com
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public final class TextPiece extends Component {
private static final Logger logger = Logger.getLogger(TextPiece.class);
TextType type;
double m_xScale; //the X-scale of the text piece
double m_yScale; //the Y-scale of the text piece
// boolean m_superScriptBeginning = false; //whether this text piece begins with a superscript
// boolean m_sparseLine = true; //whether this text piece is a sparse line or not
// boolean m_onlyPiece_PhysicalLine = true; //whether this text piece is the only piece in the physical line in document. We need this attribute to label table caption lines....
double m_width; //the width of this text piece
double m_height; //the height of this text piece
double m_widthOfSpace; //the width space in this text piece
double m_wordSpacing; //the space between words in this text piece
double m_fontSize; //the font size of this text piece
String m_fontName; //the font name of this text piece.
//PDFontDescriptor fontDescriptor;
String m_text; //the text content of this text piece
// boolean hadZeroWidth;
long[] histogram;
public TextPiece(long id) {
super(id);
m_text = "";
// hadZeroWidth = false;
this.type = new Unknown();
histogram= new long[HIST_OTHER+1];
for ( int i = 0;i<histogram.length;i++) {
histogram[i]=0L;
}
}
/*
public PDFontDescriptor getFontDescriptor() {
return fontDescriptor;
}
*/
/*
public void setFontDescriptor(PDFontDescriptor fontDescriptor) {
this.fontDescriptor = fontDescriptor;
}
*/
/**
* Gets the font name of a text piece
*
* @return the font name
*/
public String getFontName() {
return m_fontName;
}
/**
* Gets the font size of a text piece
*
* @return the font size
*/
public double getFontSize() {
return m_fontSize;
}
@Override
/**
* Gets the content of a text piece
*
* @return the text itself
*/
public String getText() {
return m_text;
}
/**
* Gets the width of a text piece
*
* @return the width of a text piece
*/
public double getWidth() {
return m_width;
}
/**
* Gets the height of a text piece
*
* @return the height of a text piece
*/
public double getHeight() {
return m_height;
}
/**
* Gets the space width between characters in a text piece
*
* @return the space width
*/
public double getWidthOfSpace() {
return m_widthOfSpace;
}
/**
* Gets the space size between words
*
* @return the space size between words
*/
public double getWordSpacing() {
return m_wordSpacing;
}
/**
* Gets the X scale of a text piece
*
* @return the X-Scale
*/
public double getXScale() {
return m_xScale;
}
/**
* Gets the Y scale of a text piece
*
* @return the Y-Scale
*/
public double getYScale() {
return m_yScale;
}
/**
* Judges whether the first character of a text piece starting is a superscript
*
* @return the superScriptBeginning
*/
/*
public boolean isSuperScriptBeginning() {
return m_superScriptBeginning;
}
*/
/**
* Judges whether a text line is a sparse line or not
*
* @return whether it is a sparse line or not
*/
/*
public boolean isSparseLine() {
return m_sparseLine;
}
*/
/**
* Sets the font name of a text piece
*
* @param fontName the font name to be set
*/
public void setFontName(String fontName) {
m_fontName = fontName;
}
/**
* Sets the font size of a text piece
*
* @param fontSize the font size to set
*/
public void setFontSize(float fontSize) {
m_fontSize = fontSize;
}
/**
* Sets a text piece starting with a superscript
*
* @param superScriptBeginning the superScriptBeginning to set
*/
/*
public void setSuperScriptBeginning(boolean superScriptBeginning) {
m_superScriptBeginning = superScriptBeginning;
}
*/
/**
* Sets a text piece as a sparse line
*
* @param sparseLine a boolean value (whether it is a sparse line or not)
*/
/*
public void setSparseLine(boolean sparseLine) {
m_sparseLine = sparseLine;
}
*/
/**
* Sets the text content of a text piece
*
* @param text the text to set
*/
public void setText(String text) {
//m_text = new String(ModifiedASCIIFoldingFilter.foldToASCII(text.toCharArray(), text.length()));
// m_text = text;
// the following is to deal with non-breaking spaces (Char(160)).
m_text = text.replace((char) (160), ' ');
}
/**
* Sets the width of a text piece
*
* @param width the width to set
*/
public void setWidth(float width) {
m_width = width;
}
/**
* Sets the height of a text piece
*
* @param height the height to set
*/
public void setHeight(float height) {
m_height = height;
}
/**
* Sets the width of the space between characters in a text piece
*
* @param widthOfSpace the widthOfSpace to set
*/
public void setWidthOfSpace(float widthOfSpace) {
m_widthOfSpace = widthOfSpace;
}
/**
* Sets the space between words
*
* @param wordSpacing the wordSpacing to set
*/
public void setWordSpacing(float wordSpacing) {
m_wordSpacing = wordSpacing;
}
/**
* Sets the XScale of a text piece
*
* @param scale the xScale of a text piece to set
*/
public void setXScale(float scale) {
m_xScale = scale;
}
/**
* Sets the YScale of a text piece
*
* @param scale the yScale to set
*/
public void setYScale(float scale) {
m_yScale = scale;
}
/**
* For some PDF files, PDFBOX extracted the texts as HTML codes.
* We have to detect such files and convert the HTML numbers back to real text for later procession
*
* @param text the string to be checked
* @return the boolean result after checking
*/
/*
public boolean isHTMLCode(String text) {
return text.startsWith("c");
}
*/
/**
* Collects all the information of a text piece and stores them in a string. The string will be printed into a middle-result file locally for testing and debugging purpose
*
* @return the generated information string for the text piece
*/
@Override
public String toString() {
String format = "Type=[%s] Text=[%s] ";
return getRectangleDebug() + " " + String.format(
format,
this.getType(),
this.getText());
}
/**
* Is this other piece of text next to (left-right not top-down) the current piece of text
*
* @param t the other piece
* @return true if it is
*/
public boolean isNextTo(TextPiece t) {
if (!this.onSameLine(t)) {
return false;
}
// note sometimes two characters have same 'X' position and a width of zero ;-(
// double dist = Math.abs(this.geom.getMinX() - t.geom.getMinX()); /* this one really shouldn't occur */
double dist2 = Math.abs(this.getGeom().getMaxX() - t.getGeom().getMinX());
// sometimes getWidthOfSpace is zero. in this case we explicitly test for it
double distAllowed = Math.max(1.0, this.getWidthOfSpace() - 0.15);
if (dist2 < (distAllowed)) {
return true;
}
return false;
}
/**
* Is this other piece of text next to (left-right not top-down) the current piece of text and the same font
*
* @param t the other piece
* @return true if it is
*/
public boolean isNextToX(TextPiece t) {
// note sometimes two characters have same 'X' position and a width of zero ;-(
return (this.isNextTo(t) &&
(t.getFontSize() == this.getFontSize()) &&
(t.getXScale() == this.getXScale()));
}
/**
* append the passed 'textpiece' to the current one
*
* @param bit the piece to append
*/
public void appendX(TextPiece bit) {
if (Math.abs(bit.geom.getMinX() - this.geom.getMaxX()) > this.getXScale() * 0.2) {
this.setText(this.getText() + " " + bit.getText());
} else {
this.setText(this.getText() + bit.getText());
}
// this.geom = new Rectangle2D.Double(geom.getP1(), new Point2D.Double(bit.geom.getX2(), geom.getY2()));
geom = geom.createUnion(bit.getGeom());
// this.setEndX(bit.getEndX());
/*
if (bit.getWidth() == 0) {
this.hadZeroWidth = true;
}
*/
}
public void setGeom(double x, double y, double w, double h) {
geom = new Rectangle2D.Double(x, y, w, h);
}
/*
public void setGeom(Point2D p1, Point2D p2) {
geom = new Rectangle2D.Double(p1.getX(), p1.getY(), p2.getX(), p2.getY());
}
*/
/*
public boolean hadZeroWidth() {
return hadZeroWidth;
}
*/
public Long getID() {
return id;
}
public TextType getType() {
return this.type;
}
@Override
public List<Component> getChildren() {
return new ArrayList<>();
}
public String getCategorizedText() {
return type.getCategorizedText();
}
public TextType getCategory() {
return type;
}
public void setCategory(TextType typer) {
type = typer;
}
public void categorize() {
for ( int i=0; i<histogram.length;i++) {
histogram[i]=0L;
}
type = TextType.categorize(this);
for (Character c : m_text.toCharArray()) {
if (Character.isDigit(c)) {
histogram[HIST_DIGIT]++;
continue;
}
if (Character.isLowerCase(c)) {
histogram[HIST_LOWER]++;
continue;
}
if (Character.isSpaceChar(c)) {
histogram[HIST_SPACE]++;
continue;
}
if (Character.isUpperCase(c)) {
histogram[HIST_UPPER]++;
continue;
}
if ('-' == c
|| '.' == c
|| ':' == c
|| '\u00B7' == c
|| '\u0387' == c
|| '-' == c
|| '\u06DD' == c
|| '\u06DE' == c) {
histogram[HIST_PUNCT]++;
continue;
}
histogram[HIST_OTHER]++;
}
}
public boolean isTOCPart(TextPiece bit) {
return (m_text.endsWith(". . ") || (m_text.endsWith(". .") || (m_text.endsWith("... ")) || (m_text.endsWith("....")))
&& !(bit.getText().equals(".") || bit.getText().equals(" ")));
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TextPiece textPiece = (TextPiece) o;
if (!geom.equals(textPiece.geom)) return false;
if (!m_text.equals(textPiece.m_text)) return false;
// if (!type.equals(textPiece.type)) return false;
return true;
}
@Override
public int hashCode() {
int result = geom.hashCode();
// result = 31 * result + type.hashCode();
result = 31 * result + m_text.hashCode();
return result;
}
public boolean isEmpty() {
if (getType().isUnknown()) {
categorize();
}
return getType().isEmpty();
}
@Override
public double density() {
assert false;
return -1;
}
@Override
public long[] getHistogram() {
return histogram;
}
}