/*
* Copyright (C) 2009 lichtflut Forschungs- und Entwicklungsgesellschaft mbH
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package de.lichtflut.infra.html;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import org.jdom.Document;
import de.lichtflut.infra.html.provider.XhtmlProvider;
/**
* <p>
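* Extracts information from an XHTML document: elements selected by an
* {@link HtmlFilter}, plain text, and tables together with their cell grids.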
* </p>
*
* <p>
* Created 07.07.2009
* </p>
*
* @author Nils Bleisch
*/
public class MTIExtractor {

    /** Document model built from the most recently supplied raw document. */
    private HtmlDocument document;

    /**
     * Creates an extractor and immediately builds the document model
     * (including the table cell grids) from the given raw document.
     *
     * @param document the raw JDOM document to analyse
     */
    public MTIExtractor(Document document) {
        generateInformationStructure(document);
    }

    /**
     * Converts the raw document into an {@link HtmlDocument} via
     * {@link XhtmlProvider} and fills the cell grid of every table found
     * below the body.
     *
     * @param rawDoc the raw JDOM document
     * @return always {@code true}
     */
    private boolean generateInformationStructure(Document rawDoc) {
        this.document = new XhtmlProvider().create(rawDoc);
        generateTables(document.getBody());
        return true;
    }

    /**
     * Recursively walks the tree below {@code node} and fills the cell grid
     * of every {@link HtmlTable} that has children.
     *
     * @param node the element to start from; {@code null} is ignored
     */
    private void generateTables(HtmlElement node) {
        if (node == null) {
            return;
        }
        List<HtmlElement> children = node.getChildren();
        // Nothing to descend into (and an empty table has no grid to build).
        if (children == null || children.isEmpty()) {
            return;
        }
        if (node instanceof HtmlTable) {
            fillTable((HtmlTable) node);
        }
        for (HtmlElement child : children) {
            generateTables(child);
        }
    }

    /**
     * Builds {@code table.table}: one array per TR element that is a direct
     * child of the table, holding the TD/TH elements that are direct children
     * of that row. Cells carrying a {@code rowspan} are additionally inserted
     * into the following rows. {@code colspan} is currently not evaluated.
     *
     * @param table the table whose grid is to be (re)built
     */
    private void fillTable(HtmlTable table) {
        HtmlFilter rowFilter = new HtmlFilter();
        rowFilter.addFilterRule(WellKnownElement.TR);
        // ArrayList: rows and cells are accessed by index below.
        List<HtmlElement> rows = new ArrayList<HtmlElement>();
        getElements(rowFilter, rows, table, 1);

        HtmlFilter cellFilter = new HtmlFilter();
        cellFilter.addFilterRule(WellKnownElement.TD);
        cellFilter.addFilterRule(WellKnownElement.TH);

        table.table = new HtmlElement[rows.size()][];
        for (int i = 0; i < rows.size(); i++) {
            // Cells are collected per row only now, so cells pushed into this
            // row by a rowspan of an earlier row are picked up as well.
            List<HtmlElement> line = new ArrayList<HtmlElement>();
            getElements(cellFilter, line, rows.get(i), 1);
            table.table[i] = new HtmlElement[line.size()];
            for (int j = 0; j < line.size(); j++) {
                HtmlElement cell = line.get(j);
                propagateRowspan(cell, rows, i, j);
                table.table[i][j] = cell;
            }
        }
    }

    /**
     * Inserts a cell with a {@code rowspan} greater than one into the child
     * lists of the rows it spans, at the cell's column index (clamped to the
     * end of shorter rows). Once the cell has been inserted, its
     * {@code rowspan} attribute is removed so that it is not propagated again
     * when the following rows are processed.
     *
     * @param cell        the cell to inspect
     * @param rows        all rows of the table, in order
     * @param rowIndex    index of the row the cell was found in
     * @param columnIndex index of the cell within its row's cell list
     */
    private void propagateRowspan(HtmlElement cell, List<HtmlElement> rows,
            int rowIndex, int columnIndex) {
        Map<String, String> attributes = cell.getAttributes();
        if (attributes == null) {
            return;
        }
        int rowspan = parseSpan(attributes.get("rowspan"));
        for (int offset = 1; offset < rowspan && rowIndex + offset < rows.size(); offset++) {
            List<HtmlElement> siblings = rows.get(rowIndex + offset).getChildren();
            if (siblings == null) {
                continue;
            }
            // Clamp: a following row may have fewer children than this index.
            siblings.add(Math.min(columnIndex, siblings.size()), cell);
            cell.getAttributes().remove("rowspan");
        }
    }

    /**
     * Parses a span attribute value leniently.
     *
     * @param value the raw attribute value, may be {@code null}
     * @return the parsed number, or {@code 1} (no span) if the value is
     *         missing or not a valid integer
     */
    private static int parseSpan(String value) {
        if (value == null) {
            return 1;
        }
        try {
            return Integer.parseInt(value.trim());
        } catch (NumberFormatException ignored) {
            // Malformed markup must not abort the whole extraction.
            return 1;
        }
    }

    /**
     * @return the document model built from the last supplied raw document
     */
    public HtmlDocument getDocument() {
        return document;
    }

    /**
     * Collects elements from head and body using a new filter on which
     * {@link HtmlFilter#invert()} has been called.
     *
     * @param htmlElements list the matching elements are appended to
     */
    public void getAllElements(List<HtmlElement> htmlElements) {
        HtmlFilter filter = new HtmlFilter();
        filter.invert();
        this.getElements(filter, htmlElements);
    }

    /**
     * Collects all elements matching the filter from the document's head and
     * body (head first), without depth limit.
     *
     * @param filter       selects the elements to collect
     * @param htmlElements list the matching elements are appended to
     */
    public void getElements(HtmlFilter filter, List<HtmlElement> htmlElements) {
        this.getElements(filter, htmlElements, document.getHead());
        this.getElements(filter, htmlElements, document.getBody());
    }

    /**
     * Collects elements matching the filter from {@code node} downwards,
     * limited in depth: {@code node} itself is visited with the given depth,
     * each level below with one less, and levels with a negative remaining
     * depth are skipped. A depth of {@code 0} therefore inspects only
     * {@code node}, a depth of {@code 1} also its direct children.
     *
     * @param filter       selects the elements to collect
     * @param htmlElements list the matching elements are appended to
     * @param node         the element to start from
     * @param depth        number of levels below {@code node} to visit
     */
    public void getElements(HtmlFilter filter,
            List<HtmlElement> htmlElements, HtmlElement node, int depth) {
        this.getElements(filter, htmlElements, node, depth, true);
    }

    /**
     * Collects all elements matching the filter from {@code node} downwards,
     * without depth limit.
     *
     * @param filter       selects the elements to collect
     * @param htmlElements list the matching elements are appended to
     * @param node         the element to start from
     */
    public void getElements(HtmlFilter filter,
            List<HtmlElement> htmlElements, HtmlElement node) {
        this.getElements(filter, htmlElements, node, 0, false);
    }

    /**
     * Recursive worker behind the public {@code getElements} variants.
     * Visits {@code node} and its descendants in document order and appends
     * every element whose name is a {@link WellKnownElement} accepted by the
     * filter. Does nothing if filter, target list or node is {@code null}.
     *
     * @param filter       selects the elements to collect
     * @param htmlElements list the matching elements are appended to
     * @param node         the element currently visited
     * @param depth        remaining depth; only evaluated if {@code depthFlag}
     * @param depthFlag    whether the depth limit is active
     */
    private void getElements(HtmlFilter filter,
            List<HtmlElement> htmlElements, HtmlElement node, int depth, boolean depthFlag) {
        if (htmlElements == null || filter == null || node == null) {
            return;
        }
        if (depthFlag && depth < 0) {
            return;
        }
        String name = node.getName();
        if (WellKnownElement.isValue(name)
                && filter.isFilterRule(WellKnownElement.forValue(name))) {
            htmlElements.add(node);
        }
        List<HtmlElement> children = node.getChildren();
        if (children == null) {
            return;
        }
        for (HtmlElement child : children) {
            getElements(filter, htmlElements, child, depth - 1, depthFlag);
        }
    }

    /**
     * Returns the text of the document's head followed by the text of its body.
     *
     * @param normalized whether normalized text should be used
     * @return the concatenated text
     */
    public String getPlainText(boolean normalized) {
        return getPlainText(document.getHead(), normalized)
                + getPlainText(document.getBody(), normalized);
    }

    /**
     * Returns the text below {@code node}, using a new, empty ignore filter.
     *
     * @param node       the element to start from
     * @param normalized whether normalized text should be used
     * @return the concatenated text
     */
    public String getPlainText(HtmlElement node, boolean normalized) {
        return getPlainText(node, new HtmlFilter(), normalized);
    }

    /**
     * Concatenates the text of all TEXT elements below {@code node} in
     * document order. A text element is skipped if its parent is a
     * {@link WellKnownElement} matched by the {@code ignore} filter.
     *
     * @param node       the element to start from
     * @param ignore     parents whose text is to be left out; {@code null}
     *                   means nothing is ignored
     * @param normalized if {@code true}, {@link HtmlText} elements contribute
     *                   their normalized text instead of their raw text
     * @return the concatenated text, never {@code null}
     */
    public String getPlainText(HtmlElement node, HtmlFilter ignore, boolean normalized) {
        HtmlFilter textFilter = new HtmlFilter();
        textFilter.addFilterRule(WellKnownElement.TEXT);
        List<HtmlElement> textNodes = new ArrayList<HtmlElement>();
        getElements(textFilter, textNodes, node);

        StringBuilder buf = new StringBuilder();
        for (HtmlElement elem : textNodes) {
            HtmlElement parent = elem.getParent();
            boolean knownParent = parent != null
                    && WellKnownElement.isValue(parent.getName());
            if (knownParent && ignore != null
                    && ignore.isFilterRule(WellKnownElement.forValue(parent.getName()))) {
                continue;
            }
            // The normalized flag applies to every text node, including those
            // whose parent is missing or not a well-known element.
            if (normalized && elem instanceof HtmlText) {
                buf.append(((HtmlText) elem).getNormalizedText());
            } else {
                buf.append(elem.getText());
            }
        }
        return buf.toString();
    }

    /**
     * Returns all TABLE elements at or below {@code node}, in document order.
     *
     * @param node the element to start from
     * @return the tables found; empty if there are none
     */
    public HtmlTable[] getTables(HtmlElement node) {
        List<HtmlElement> found = new ArrayList<HtmlElement>();
        HtmlFilter filter = new HtmlFilter();
        filter.addFilterRule(WellKnownElement.TABLE);
        getElements(filter, found, node);

        HtmlTable[] tables = new HtmlTable[found.size()];
        int index = 0;
        for (HtmlElement element : found) {
            tables[index++] = (HtmlTable) element;
        }
        return tables;
    }

    /**
     * Returns all TABLE elements below the document's body.
     *
     * @return the tables found; empty if there are none
     */
    public HtmlTable[] getTables() {
        return getTables(document.getBody());
    }

    /**
     * Replaces the current document: rebuilds the document model and the
     * table cell grids from the given raw document.
     *
     * @param document the raw JDOM document to analyse
     */
    public void setDocument(Document document) {
        generateInformationStructure(document);
    }
}