package org.nines.cleaner;
import org.nines.ICustomCleaner;
/**
 * Custom cleaner for the {@code locEphemera} archive.
 *
 * For that archive it keeps only the text found after the second
 * {@code <hr>} line and before the line that mentions the SGML version of the
 * document, and removes everything enclosed in curly braces. Content that
 * belongs to any other archive is returned untouched.
 */
public class LocEphemeraCleaner implements ICustomCleaner {

    /** Name of the only archive this cleaner rewrites. */
    private static final String ARCHIVE_NAME = "locEphemera";

    /** Text marking a separator line; accepted content begins after the second one. */
    private static final String START_MARKER = "<hr>";

    /** Text of the line that ends the accepted content. */
    private static final String STOP_MARKER = "Information about SGML version of this document.";

    /**
     * Clean the content of a document from the named archive.
     *
     * @param archiveName name of the archive the content belongs to; may be null
     * @param content raw text of the document; may be null
     * @return the stripped text when the archive is {@code locEphemera} and
     *         content is non-null, otherwise {@code content} unchanged
     */
    public String clean(String archiveName, String content) {
        // Constant-first equals and the null check keep null inputs from throwing.
        if (content != null && ARCHIVE_NAME.equals(archiveName)) {
            return stripJunk(content, START_MARKER, STOP_MARKER);
        }
        return content;
    }

    /**
     * Take all content after the SECOND line containing the start word, up to
     * (not including) the first later line containing the stop word. Text
     * enclosed in { } is dropped together with the braces, including groups
     * that span several lines.
     *
     * Every line is trimmed before it is examined. The line holding the second
     * start word is itself discarded. The start word is matched
     * case-insensitively; the stop word is matched exactly.
     *
     * @param content text to filter, split on newline characters
     * @param startWord marker whose second occurrence opens the accepted region
     * @param stopWord marker whose first occurrence after the start closes it
     * @return the accepted text, one line per accepted input line, trimmed
     */
    private String stripJunk(String content, String startWord, String stopWord) {
        // Locale.ROOT keeps the case folding independent of the JVM default locale.
        final String startMarker = startWord.toLowerCase(java.util.Locale.ROOT);
        final StringBuilder finalContent = new StringBuilder();
        int startCnt = 0;
        boolean started = false;
        boolean inBrace = false;

        for (String rawLine : content.split("\n")) {
            final String line = rawLine.trim();

            if (!started) {
                // Still in the header: only count start markers. A stop word seen
                // here is ignored so it cannot switch output on prematurely.
                if (line.toLowerCase(java.util.Locale.ROOT).contains(startMarker)) {
                    startCnt++;
                    started = (startCnt == 2);
                }
                continue;
            }

            final StringBuilder kept = new StringBuilder();
            inBrace = appendOutsideBraces(line, inBrace, kept);

            if (inBrace) {
                // The line ends inside an open brace group. Keep what preceded the
                // brace without a newline so it joins the text that follows the
                // closing brace on a later line.
                finalContent.append(kept);
                continue;
            }

            final String visible = kept.toString();
            if (visible.contains(stopWord)) {
                // Nothing after the stop line is wanted.
                break;
            }
            finalContent.append(visible).append('\n');
        }
        return finalContent.toString().trim();
    }

    /**
     * Append to {@code out} the parts of {@code line} that lie outside curly braces.
     *
     * A closing brace seen while no group is open is kept as ordinary text.
     *
     * @param line the line to scan
     * @param inBrace whether a brace group opened on an earlier line is still open
     * @param out receives the text outside of braces
     * @return whether a brace group is still open at the end of the line
     */
    private static boolean appendOutsideBraces(String line, boolean inBrace, StringBuilder out) {
        int pos = 0;
        // pos strictly advances on every pass, so the scan always terminates.
        while (pos < line.length()) {
            if (inBrace) {
                final int close = line.indexOf('}', pos);
                if (close == -1) {
                    // The rest of the line is still inside the group.
                    return true;
                }
                inBrace = false;
                pos = close + 1;
            } else {
                final int open = line.indexOf('{', pos);
                if (open == -1) {
                    out.append(line, pos, line.length());
                    return false;
                }
                out.append(line, pos, open);
                inBrace = true;
                pos = open + 1;
            }
        }
        return inBrace;
    }
}