/*
* Copyright (C) 2007-2011 Geometer Plus <contact@geometerplus.com>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA.
*/
package org.geometerplus.fbreader.formats.html;
import java.util.HashMap;
import java.io.*;
import java.nio.charset.*;
import org.geometerplus.fbreader.bookmodel.BookModel;
import org.geometerplus.fbreader.bookmodel.BookReader;
import org.geometerplus.fbreader.bookmodel.FBTextKind;
import org.geometerplus.zlibrary.core.html.*;
import org.geometerplus.zlibrary.core.util.ZLArrayUtils;
import org.geometerplus.zlibrary.text.model.ZLTextParagraph;
import org.geometerplus.zlibrary.core.xml.ZLXMLProcessor;
import org.geometerplus.fbreader.formats.xhtml.XHTMLReader;
public class HtmlReader extends BookReader implements ZLHtmlReader {
private final byte[] myStyleTable = new byte[HtmlTag.TAG_NUMBER];
{
myStyleTable[HtmlTag.H1] = FBTextKind.H1;
myStyleTable[HtmlTag.H2] = FBTextKind.H2;
myStyleTable[HtmlTag.H3] = FBTextKind.H3;
myStyleTable[HtmlTag.H4] = FBTextKind.H4;
myStyleTable[HtmlTag.H5] = FBTextKind.H5;
myStyleTable[HtmlTag.H6] = FBTextKind.H6;
myStyleTable[HtmlTag.B] = FBTextKind.BOLD;
myStyleTable[HtmlTag.SUB] = FBTextKind.SUB;
myStyleTable[HtmlTag.SUP] = FBTextKind.SUP;
myStyleTable[HtmlTag.S] = FBTextKind.STRIKETHROUGH;
myStyleTable[HtmlTag.PRE] = FBTextKind.PREFORMATTED;
myStyleTable[HtmlTag.EM] = FBTextKind.EMPHASIS;
myStyleTable[HtmlTag.DFN] = FBTextKind.DEFINITION;
myStyleTable[HtmlTag.CITE] = FBTextKind.CITE;
myStyleTable[HtmlTag.CODE] = FBTextKind.CODE;
myStyleTable[HtmlTag.STRONG] = FBTextKind.STRONG;
myStyleTable[HtmlTag.I] = FBTextKind.ITALIC;
}
protected final CharsetDecoder myAttributeDecoder;
private boolean preflag= true;//如果有pre 开始就要在 body 前关闭它
private boolean myInsideTitle = false;
private boolean mySectionStarted = false;
private byte myHyperlinkType;
private final char[] SPACE = { ' ' };
private String myHrefAttribute = "href";
private boolean myOrderedListIsStarted = false;
//private boolean myUnorderedListIsStarted = false;
private int myOLCounter = 0;
private byte[] myControls = new byte[10];
private byte myControlsNumber = 0;
public HtmlReader(BookModel model) throws UnsupportedEncodingException {
super(model);
try {
//String encoding = model.Book.getEncoding();
myAttributeDecoder = createDecoder();
setByteDecoder(createDecoder());
} catch (UnsupportedCharsetException e) {
throw new UnsupportedEncodingException(e.getMessage());
}
}
protected final CharsetDecoder createDecoder() throws UnsupportedEncodingException {
return Charset.forName(Model.Book.getEncoding()).newDecoder()
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE);
}
public boolean readBook() throws IOException {
System.out.println("----hym--readbook html-");
return ZLHtmlProcessor.read(this, getInputStreamReader());
//hym 修改 getInputStream 改成了 getInputStreamReader
}
//hym 修改 getInputStream 改成了 getInputStreamReader 会影响继承他的子类,都要检查一下
public InputStreamReader getInputStreamReader() throws IOException {
System.out.println("---12---"+Model.Book.getEncoding());
return new InputStreamReader(Model.Book.File.getInputStream( ),Model.Book.getEncoding());
}
public void startDocumentHandler() {
setMainTextModel();//hym 添加,不然 mobipockethtmlbookreader 会出错。
}
public void endDocumentHandler() {
unsetCurrentTextModel();
}
public boolean isEnd(String tmpstr){
// if(tmpstr==null||tmpstr.trim().equals("")){
// return true;
// }else{
char laststr = tmpstr.charAt(tmpstr.length()-1);
if(laststr=='。'){
return true;
}
if(laststr=='”'){
return true;
}
if(laststr=='!'){
return true;
}
if(laststr=='?'){
return true;
}
if(laststr=='.'){
return true;
}
if(laststr=='!'){
return true;
}
if(laststr=='?'){
return true;
}
if(laststr=='」'){
return true;
}
if(laststr==':'){
return true;
}
if(laststr==':'){
return true;
}
if(laststr=='’'){
return true;
}
if(laststr=='\''){
return true;
}
if(laststr=='"'){
return true;
}
if(laststr==')'){
return true;
}
if(laststr==')'){
return true;
}
return false;
// }
}
public void charDataHandler(char[] data, int start, int length) {
if(Model.Book.getZnFlag()){
//智能处理文本,速度慢。
String str= new String(data,start,length);
String[] strarr=str.split("\n");
// 处理文本
String ttstr="";
for(int i=0;i<strarr.length;i++){
String ttmpstr=strarr[i].trim();
ttmpstr=ttmpstr.replaceAll(" ", "");
ttstr+=ttmpstr;
if(!ttstr.equals("")){
if(ttmpstr.length()>28){//要判断 最后是不是标点符号,来确定是否一个段落,还是txt为了看的方便自己的做的换行。
if(isEnd(ttstr)){
ttstr=ttstr+"\r\n";
addData(ttstr.toCharArray(), 0,ttstr.length(),false);
endParagraph();
beginParagraph(ZLTextParagraph.Kind.TEXT_PARAGRAPH);
ttstr="";
}else{
if(i==strarr.length-1){//到这一段的最后一行了
ttstr=ttstr+"\r\n";
addData(ttstr.toCharArray(), 0,ttstr.length(),false);
endParagraph();
beginParagraph(ZLTextParagraph.Kind.TEXT_PARAGRAPH);
ttstr="";
}
}
}else{//字数 少 就直接是段落
ttstr=ttstr+"\r\n";
addData(ttstr.toCharArray(), 0,ttstr.length(),false);
endParagraph();
beginParagraph(ZLTextParagraph.Kind.TEXT_PARAGRAPH);
ttstr="";
}
}
}
}else{
int count=length;
int ss=0;
for (int i = 0; i < count; i++) {
if (data[i+start] == '\n') {
if (ss != i) {
addData(data, start+ss, i - ss,false);
endParagraph();
beginParagraph(ZLTextParagraph.Kind.TEXT_PARAGRAPH);
}
ss = i + 1;
} else if (data[i+start] == '\r') {
continue;
} else if (data[i+start] == ' ' || data[i+start] == '\t') {
data[i+start] = ' ';
// if(i+start-1>=0&&data[i+start-1] == ' '){
// data[i+start-1] = ' ';
// }
} else {
}
}
if (ss != count) {
addData(data, start+ss, count - ss,false);
}
}
// addData(data, start, length,false);
// System.out.println("hym--html-"+length+":"+new String(data, start, length));
}
private HashMap<String,char[]> myEntityMap;
public void entityDataHandler(String entity) {
if (myEntityMap == null) {
myEntityMap = new HashMap<String,char[]>(ZLXMLProcessor.getEntityMap(XHTMLReader.xhtmlDTDs()));
}
char[] data = myEntityMap.get(entity);
if (data == null) {
if ((entity.length() > 0) && (entity.charAt(0) == '#')) {
try {
int number;
if (entity.charAt(1) == 'x') {
number = Integer.parseInt(entity.substring(2), 16);
} else {
number = Integer.parseInt(entity.substring(1));
}
data = new char[] { (char)number };
} catch (NumberFormatException e) {
}
}
if (data == null) {
data = new char[0];
}
myEntityMap.put(entity, data);
}
addData(data);
System.out.println("html:"+entity+"|"+new String(data));
}
private void openControl(byte control) {
addControl(control, true);
if (myControlsNumber == myControls.length) {
myControls = ZLArrayUtils.createCopy(myControls, myControlsNumber, 2 * myControlsNumber);
}
myControls[myControlsNumber++] = control;
}
private void closeControl(byte control) {
for (int i = 0; i < myControlsNumber; i++) {
addControl(myControls[i], false);
}
boolean flag = false;
int removedControl = myControlsNumber;
for (int i = 0; i < myControlsNumber; i++) {
if (!flag && (myControls[i] == control)) {
flag = true;
removedControl = i;
continue;
}
addControl(myControls[i], true);
}
if (removedControl == myControlsNumber) {
return;
}
--myControlsNumber;
for (int i = removedControl; i < myControlsNumber; i++) {
myControls[i] = myControls[i + 1];
}
}
private void startNewParagraph() {
endParagraph();
beginParagraph(ZLTextParagraph.Kind.TEXT_PARAGRAPH);
}
public final void endElementHandler(String tagName) {
// System.out.println("hym---html:end->"+tagName);
if(HtmlTag.getTagByName(tagName)==HtmlTag.PRE)
this.preflag=true;
if(HtmlTag.getTagByName(tagName)==HtmlTag.BODY&&!preflag){
endElementHandler(HtmlTag.PRE);
// System.out.println("hym---html:add end->"+tagName);
}
endElementHandler(HtmlTag.getTagByName(tagName));
}
public void endElementHandler(byte tag) {
switch (tag) {
case HtmlTag.SCRIPT:
case HtmlTag.SELECT:
case HtmlTag.STYLE:
case HtmlTag.P:
startNewParagraph();
break;
case HtmlTag.H1:
case HtmlTag.H2:
case HtmlTag.H3:
case HtmlTag.H4:
case HtmlTag.H5:
case HtmlTag.H6:
case HtmlTag.PRE:
closeControl(myStyleTable[tag]);
startNewParagraph();
break;
case HtmlTag.A:
closeControl(myHyperlinkType);
break;
case HtmlTag.BODY:
popAllKind();//hym
break;
case HtmlTag.HTML:
//unsetCurrentTextModel();
break;
case HtmlTag.B:
case HtmlTag.S:
case HtmlTag.SUB:
case HtmlTag.SUP:
case HtmlTag.EM:
case HtmlTag.DFN:
case HtmlTag.CITE:
case HtmlTag.CODE:
case HtmlTag.STRONG:
case HtmlTag.I:
closeControl(myStyleTable[tag]);
break;
case HtmlTag.OL:
myOrderedListIsStarted = false;
myOLCounter = 0;
break;
case HtmlTag.UL:
//myUnorderedListIsStarted = false;
break;
default:
break;
}
}
public final void startElementHandler(String tagName, int offset, ZLHtmlAttributeMap attributes) {
// System.out.println("hym---html:start->"+tagName);
if(HtmlTag.getTagByName(tagName)==HtmlTag.PRE)
this.preflag=false;
startElementHandler(HtmlTag.getTagByName(tagName), offset, attributes);
}
public void startElementHandler(byte tag, int offset, ZLHtmlAttributeMap attributes) {
switch (tag) {
case HtmlTag.HTML:
break;
case HtmlTag.BODY:
setMainTextModel();
pushOneKind(FBTextKind.REGULAR);//hym
beginParagraph(ZLTextParagraph.Kind.TEXT_PARAGRAPH);
break;
case HtmlTag.P:
if (mySectionStarted) {
mySectionStarted = false;
} else if (myInsideTitle) {
addContentsData(SPACE);
}
beginParagraph(ZLTextParagraph.Kind.TEXT_PARAGRAPH);
break;
case HtmlTag.A:{
String ref = attributes.getStringValue(myHrefAttribute, myAttributeDecoder);
if ((ref != null) && (ref.length() != 0)) {
if (ref.charAt(0) == '#') {
myHyperlinkType = FBTextKind.FOOTNOTE;
ref = ref.substring(1);
} else if (ref.charAt(0) == '&') {
myHyperlinkType = FBTextKind.INTERNAL_HYPERLINK;
ref = ref.substring(1);
}
else {
myHyperlinkType = FBTextKind.EXTERNAL_HYPERLINK;
}
addHyperlinkControl(myHyperlinkType, ref);
myControls[myControlsNumber] = myHyperlinkType;
myControlsNumber++;
}
break;
}
case HtmlTag.IMG: {
/*
String ref = attributes.getStringValue(mySrcAttribute, myAttributeDecoder);
if ((ref != null) && (ref.length() != 0)) {
addImageReference(ref, (short)0);
String filePath = ref;
if (!":\\".equals(ref.substring(1, 3))) {
filePath = Model.Book.File.getPath();
filePath = filePath.substring(0, filePath.lastIndexOf('\\') + 1) + ref;
}
addImage(ref, new ZLFileImage(MimeTypes.MIME_IMAGE_AUTO, ZLFile.createFileByPath(filePath)));
}
*/
break;
}
case HtmlTag.B:
case HtmlTag.S:
case HtmlTag.SUB:
case HtmlTag.SUP:
case HtmlTag.PRE:
case HtmlTag.STRONG:
case HtmlTag.CODE:
case HtmlTag.EM:
case HtmlTag.CITE:
case HtmlTag.DFN:
case HtmlTag.I:
openControl(myStyleTable[tag]);
break;
case HtmlTag.H1:
case HtmlTag.H2:
case HtmlTag.H3:
case HtmlTag.H4:
case HtmlTag.H5:
case HtmlTag.H6:
startNewParagraph();
openControl(myStyleTable[tag]);
break;
case HtmlTag.OL:
myOrderedListIsStarted = true;
break;
case HtmlTag.UL:
//myUnorderedListIsStarted = true;
break;
case HtmlTag.LI:
startNewParagraph();
if (myOrderedListIsStarted) {
char[] number = (new Integer(++myOLCounter)).toString().toCharArray();
addData(number);
addData(new char[] {'.', ' '});
} else {
addData(new char[] {'*', ' '});
}
break;
case HtmlTag.SCRIPT:
case HtmlTag.SELECT:
case HtmlTag.STYLE:
endParagraph();
break;
case HtmlTag.TR:
case HtmlTag.BR:
startNewParagraph();
break;
default:
break;
}
}
}