/**
*
*/
package fna.parsing.datacleaner;
import java.io.File;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jdom.Document;
import org.jdom.Element;
import org.jdom.input.SAXBuilder;
import org.jdom.xpath.XPath;
import fna.parsing.ParsingUtil;
/**
* @author hongcui
*
*/
@SuppressWarnings({ "unchecked"})
public abstract class DataCleaner{
protected String legalvalues = null;
protected ArrayList<String> sourceelements = new ArrayList<String>();
protected File outputdir = null;
protected File sourcedir = null;
protected String outputelement = null;
protected ArrayList<String> sourcecontent = new ArrayList<String>();
/**
*
*/
public DataCleaner(String sourcedir, ArrayList<String> sourceElements, String outputElement, String outputdir) {
// TODO Auto-generated constructor stub
this.sourcedir = new File(sourcedir);
this.sourceelements = sourceElements;
this.outputdir = new File(outputdir);
if(! this.outputdir.exists()){
this.outputdir.mkdir();
}
this.outputelement = outputElement;
}
/**
* ***********************************************************************************
* collect content from sourceElements in the files from sourcedir
* save the content text in sourcecontent
*/
protected void collectSourceContent(){
File[] flist = sourcedir.listFiles();
for(int i = 0; i<flist.length; i++){
saveContents(flist[i]);
}
}
private void saveContents(File source){
try{
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(source);
Element root = doc.getRootElement();
Iterator<String> it = sourceelements.iterator();
while(it.hasNext()){
String ename = it.next();
List<Element> elements = XPath.selectNodes(root, "//"+ename);
Iterator<Element> eit = elements.iterator();
while(eit.hasNext()){
Element e = eit.next();
sourcecontent.add(e.getText());
}
}
}catch (Exception e){
e.printStackTrace();
}
}
/*
* **************************************************************************************
* replace the content of each source element with its legal value
* replace the source element name with output element name
*/
protected void cleanFiles(){
File[] flist = sourcedir.listFiles();
for(int i = 0; i<flist.length; i++){
cleanElements(flist[i]);
}
}
private void cleanElements(File file) {
try{
SAXBuilder builder = new SAXBuilder();
Document doc = builder.build(file);
Element root = doc.getRootElement();
root = clean(root);
root.detach();
ParsingUtil.outputXML(root, new File(outputdir, file.getName()), null);
}catch (Exception e){
e.printStackTrace();
}
}
protected abstract Element clean(Element root);
/**
* 1 text may contain multiple legal values
* @param text
* @return a set of legal values
*/
protected ArrayList<String> cleanText(String text) {
ArrayList<String> values = new ArrayList<String>();
//text = text.toLowerCase();
System.out.println();
System.out.print(text+"=========>");
text = text.replaceAll("\\.", "PERIOD").replaceAll("\\p{Punct}", "").replaceAll("PERIOD", ".");
Pattern p = Pattern.compile(".*?\\b("+this.legalvalues+")( |$|\\W)(.*)");
Matcher m = p.matcher(text);
while(m.matches()){
values.add(m.group(1));
System.out.print(":"+standardize(m.group(1)));
m = p.matcher(m.group(3));
}
return values;
}
protected String standardize(String s){
return s;
}
/*
* ***********************************************************************************
*/
protected abstract void collectLegalValues();
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
}
}