/*
* Tanaguru - Automated webpage assessment
* Copyright (C) 2008-2015 Tanaguru.org
*
* This file is part of Tanaguru.
*
* Tanaguru is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
* Contact us by mail: tanaguru AT tanaguru DOT org
*/
package org.tanaguru.contentadapter.util;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static string helpers that change the case of tag names in an HTML source
 * and that remove or extract its doctype declaration.
 *
 * <p>All methods work on the raw text with simple character scanning; no HTML
 * parser is involved. Every method throws {@link NullPointerException} when
 * given a {@code null} argument.</p>
 *
 * @author jkowalczyk
 */
public abstract class DocumentCaseInsensitiveAdapter {

    /** Upper-case spelling of the doctype opening looked up first. */
    private static final String DOCTYPE_UPPER = "<!DOCTYPE";

    /** Lower-case spelling of the doctype opening used as a fallback. */
    private static final String DOCTYPE_LOWER = "<!doctype";

    /** Matches any run of whitespace (spaces, tabs, line breaks). */
    private static final Pattern WHITESPACE_RUN = Pattern.compile("\\s+");

    /**
     * Characters that, when found right after a '&lt;', make
     * {@link #removeLowerCaseTags(String)} leave the construct untouched
     * (e.g. {@code <!doctype} and {@code <?xml}).
     */
    private static final String MARKERS_SKIPPED_WHEN_UPPERCASING = "!?_";

    /**
     * Characters that, when found right after a '&lt;', make
     * {@link #removeUpperCaseTags(String)} leave the construct untouched
     * (e.g. {@code <!DOCTYPE}).
     */
    private static final String MARKERS_SKIPPED_WHEN_LOWERCASING = "!";

    /**
     * Converts the names of opening and closing tags to upper case.
     *
     * <p>Constructs starting with {@code <!}, {@code <?} or {@code <_} are
     * copied unchanged. A '&lt;' that is the last character of the input, or
     * that is not followed by any '&gt;', is copied as plain text.</p>
     *
     * @param cleanHtml the html source to convert
     * @return the source with upper-cased tag names
     */
    public static String removeLowerCaseTags(String cleanHtml) {
        return convertTagNames(
                cleanHtml, MARKERS_SKIPPED_WHEN_UPPERCASING, true);
    }

    /**
     * Converts the names of opening and closing tags to lower case.
     *
     * <p>Constructs starting with {@code <!} are copied unchanged. A '&lt;'
     * that is the last character of the input, or that is not followed by any
     * '&gt;', is copied as plain text.</p>
     *
     * @param cleanHtml the html source to convert
     * @return the source with lower-cased tag names
     */
    public static String removeUpperCaseTags(String cleanHtml) {
        return convertTagNames(
                cleanHtml, MARKERS_SKIPPED_WHEN_LOWERCASING, false);
    }

    /**
     * Removes the first doctype declaration found in the source. The
     * upper-case spelling {@code <!DOCTYPE} is searched first, then the
     * lower-case spelling {@code <!doctype}.
     *
     * @param html the html source
     * @return the source without its doctype declaration, or the source
     *         itself when it has no doctype or when the doctype is never
     *         closed by a '&gt;'
     */
    public static String removeDoctypeDeclaration(String html) {
        int doctypeStart = findDoctypeStart(html);
        if (doctypeStart == -1) {
            return html;
        }
        int doctypeEnd = html.indexOf('>', doctypeStart);
        if (doctypeEnd == -1) {
            // Unterminated declaration: there is nothing well-defined to cut.
            return html;
        }
        StringBuilder withoutDoctype = new StringBuilder(html.length());
        withoutDoctype.append(html, 0, doctypeStart);
        // The end index of append is exclusive, so html.length() is needed
        // to keep the last character of the document.
        withoutDoctype.append(html, doctypeEnd + 1, html.length());
        return withoutDoctype.toString();
    }

    /**
     * This method extracts the doctype declaration from the html source code.
     * Every run of whitespace inside the declaration (including line breaks)
     * is collapsed into a single space.
     *
     * @param html the html source
     * @return the doctype declaration on a single line, or an empty string
     *         when the source has no doctype or when the doctype is never
     *         closed by a '&gt;'
     */
    public static String extractDoctypeDeclaration(String html) {
        int doctypeStart = findDoctypeStart(html);
        if (doctypeStart == -1) {
            return "";
        }
        int doctypeEnd = html.indexOf('>', doctypeStart);
        if (doctypeEnd == -1) {
            return "";
        }
        String doctype = html.substring(doctypeStart, doctypeEnd + 1);
        Matcher matcher = WHITESPACE_RUN.matcher(doctype);
        return matcher.replaceAll(" ");
    }

    /**
     * Returns the index of the doctype opening, preferring the upper-case
     * spelling over the lower-case one, or -1 when neither is present.
     */
    private static int findDoctypeStart(String html) {
        int start = html.indexOf(DOCTYPE_UPPER);
        return start != -1 ? start : html.indexOf(DOCTYPE_LOWER);
    }

    /**
     * Copies the source while converting the case of each tag name.
     *
     * @param html the html source
     * @param skippedMarkers characters that, right after a '&lt;', cause the
     *        construct to be copied unchanged
     * @param toUpperCase {@code true} to upper-case the names, {@code false}
     *        to lower-case them
     * @return the converted source
     */
    private static String convertTagNames(
            String html, String skippedMarkers, boolean toUpperCase) {
        int length = html.length();
        StringBuilder converted = new StringBuilder(length);
        int position = 0;
        while (position < length) {
            char current = html.charAt(position);
            if (current == '<' && position + 1 < length) {
                char marker = html.charAt(position + 1);
                int tagEnd = skippedMarkers.indexOf(marker) == -1
                        ? html.indexOf('>', position)
                        : -1;
                if (tagEnd != -1) {
                    int nameStart;
                    int nameEnd;
                    if (marker == '/') {
                        // Closing tag: the name runs up to the '>'.
                        converted.append("</");
                        nameStart = position + 2;
                        nameEnd = tagEnd;
                    } else {
                        // Opening tag: the name stops at the first space
                        // when the tag carries attributes, else at the '>'.
                        // NOTE(review): only a plain space is recognised as
                        // delimiter; a tab or line break after the name makes
                        // everything up to the '>' part of the "name".
                        converted.append('<');
                        int space = html.indexOf(' ', position);
                        nameStart = position + 1;
                        nameEnd = (space != -1 && space < tagEnd)
                                ? space
                                : tagEnd;
                    }
                    String name = html.substring(nameStart, nameEnd);
                    // Locale.ROOT keeps the conversion independent of the
                    // JVM default locale (e.g. Turkish dotted/dotless i).
                    converted.append(toUpperCase
                            ? name.toUpperCase(Locale.ROOT)
                            : name.toLowerCase(Locale.ROOT));
                    // Resume on the delimiter so that it is copied verbatim.
                    position = nameEnd;
                    continue;
                }
            }
            converted.append(current);
            position++;
        }
        return converted.toString();
    }
}