/**
* @author Jonathan Shuman
* @Purpose For Indexing WikiQuotes.
* A helper class which represents a single document in an index
*/
package edu.uncc.cs.watsonsim.scripts;
import java.util.Locale;
import java.util.Scanner;
/**
 * A single Wikiquote page that is being converted into a TREC-style
 * {@code <DOC>} entry for indexing.
 *
 * <p>The object is a mutable holder: set the title, text and document number,
 * call {@link #toString()} to obtain the formatted entry, then call
 * {@link #reset()} to reuse the instance for the next page.
 *
 * <p>A quote is flagged as "trash" when its title marks it as a metadata page
 * or when its text contains a redirect directive; trash quotes (and quotes
 * whose document number is 0) are rendered as the empty string.
 *
 * @author ShumanLaptop
 */
public class WikiquoteQuote {
    /** Substrings of a title that identify a metadata page which must not be indexed. */
    private static final String[] METADATA_TITLE_MARKERS = {
        "Wikiquote:", "Contents:", "Template:"
    };

    private String text;
    private String title;
    private boolean trash = false;
    private int docNumber;

    /**
     * @return the text as last set, or as rewritten by a previous call to
     *         {@link #toString()}
     */
    public String getText() {
        return text;
    }

    /**
     * @param text the raw page text to set
     */
    public void setText(String text) {
        this.text = text;
    }

    /**
     * @return the cleaned title
     */
    public String getTitle() {
        return title;
    }

    /**
     * Stores the title after stripping the surrounding {@code <title ...>} /
     * {@code </title>} tags (with adjacent whitespace) and any
     * {@code &xxx;} entity such as {@code &quot;}.
     *
     * <p>If the cleaned title contains one of the metadata markers
     * ({@code Wikiquote:}, {@code Contents:}, {@code Template:}) the quote is
     * flagged as trash. The flag is only ever raised here, never cleared; use
     * {@link #reset()} or {@link #setTrash(boolean)} to clear it.
     *
     * @param title the title to set; {@code null} is treated as an empty title
     */
    public void setTitle(String title) {
        if (title == null) {
            this.title = "";
            return;
        }
        // Drop the opening/closing title tags together with surrounding whitespace.
        String cleaned = title.replaceAll("(\\s*<title[^<>]*>\\s*)|(\\s*</title>)", "");
        // Drop all &xxx; entities.
        cleaned = cleaned.replaceAll("&(.*?);", "");
        this.title = cleaned;

        for (String marker : METADATA_TITLE_MARKERS) {
            if (cleaned.contains(marker)) {
                this.trash = true;
                break;
            }
        }
    }

    /**
     * @return {@code true} if this quote should be excluded from the output
     */
    public boolean isTrash() {
        return trash;
    }

    /**
     * @param trash the trash flag to set
     */
    public void setTrash(boolean trash) {
        this.trash = trash;
    }

    /**
     * Renders this quote as a TREC-style document: a {@code <DOC>} header with
     * the document number, the title, every non-empty line of the cleaned
     * text, and the closing footer.
     *
     * <p>Side effect: the stored text is cleaned in place, and the quote is
     * flagged as trash if its text contains a redirect directive.
     *
     * @return the formatted document, or the empty string when there is no
     *         text, the document number is 0, or the quote is trash
     */
    @Override
    public String toString() {
        // A freshly constructed instance has a null text; treat it like an empty one.
        if (text == null || text.isEmpty()) {
            return "";
        }
        cleanupText();
        if (docNumber == 0 || trash) {
            return "";
        }

        StringBuilder output = new StringBuilder(getHeader(docNumber));
        output.append("<title>").append(title == null ? "" : title).append("</title>\n");

        Scanner reader = new Scanner(text);
        try {
            while (reader.hasNextLine()) {
                String line = reader.nextLine();
                // Compare by content: a reference comparison against "" is not reliable.
                if (!line.isEmpty()) {
                    output.append('\n').append(line);
                }
            }
        } finally {
            reader.close();
        }

        output.append('\n').append(getFooter());
        return output.toString();
    }

    /**
     * Strips wiki markup from the stored text in place. If the text contains a
     * redirect directive (in any letter case) the quote is flagged as trash and
     * the remaining cleanup is skipped.
     *
     * <p>The substitutions are order dependent; later patterns expect the
     * output of earlier ones.
     *
     * <p>NOTE(review): several patterns use a greedy {@code [^<>]*} or
     * {@code .*}, so they remove everything between the first opener and the
     * last closer they can reach. Confirm against real dump data that this is
     * intended before tightening them.
     */
    private void cleanupText() {
        // The <text> tag usually carries an extra attribute (xml:space="preserve");
        // normalise it to a bare <text>.
        text = text.replaceFirst("\\<text[^<>]*\\>", "<text>");

        // Redirect pages carry no content of their own.
        if (text.toLowerCase(Locale.ROOT).contains("#redirect")) {
            trash = true;
            return;
        }

        // [[xx:xxxxx]] entries (two characters followed by a colon).
        text = text.replaceAll("\\[\\[..:[^<>]*\\]\\]", "");
        // [[, ]] and '' markers, keeping the content between them.
        text = text.replaceAll("(\\[\\[)|(\\]\\])|('')", "");
        // {{meta-tags}}.
        text = text.replaceAll("\\{\\{[^<>]*\\}\\}", "");
        // Any [[ or ]] that the removals above may have left behind.
        text = text.replaceAll("(\\[\\[)|(\\]\\])", "");
        // Every "<" and every ";br".
        text = text.replaceAll("<|;br", "");
        // All &xxx; entities.
        text = text.replaceAll("&(.*?);", "");
        // Tags such as ";hr width=50%/", ";hr width=50%'/" or ";hr width=50%''/".
        text = text.replaceAll(";(.*?)%'?", "");
        // Tables of contents: {|text-center xxx |}.
        text = text.replaceAll("\\{\\|[^<>]*\\|\\}", "");
        // "w:" link prefixes.
        text = text.replaceAll("w:", "");
        // "wikt:" link prefixes.
        text = text.replaceAll("wikt:", "");
        // ==Section headings==.
        text = text.replaceAll("==.*==", "");
        // Comment remnants such as ";!-- START TABLE OF CONTENTS --".
        text = text.replaceAll(";!--(.*?)--", "");
        // __TAG__ directives.
        text = text.replaceAll("__(.*?)__", "");
        // A "/" surrounded by whitespace (e.g. a line holding only a slash).
        text = text.replaceAll("\\s/\\s", "");
        // Remaining ;xxxx; tags.
        text = text.replaceAll(";.*;", "");
        // ;p tags.
        text = text.replaceAll(";p", "");
        // Category:xxxx through the end of the line.
        text = text.replaceAll("Category:.*", "");
    }

    /**
     * @return the document number used in the {@code <DOCNO>} element
     */
    public int getDocNumber() {
        return docNumber;
    }

    /**
     * @param curDocNumber the document number to set; 0 suppresses output
     *                     from {@link #toString()}
     */
    public void setDocNumber(int curDocNumber) {
        this.docNumber = curDocNumber;
    }

    /**
     * Builds the opening part of a document entry.
     *
     * @param curDocNumber the number appended to the {@code wikiquote-trec-} id
     * @return the {@code <DOC>} and {@code <DOCNO>} lines
     */
    private static String getHeader(int curDocNumber) {
        return "<DOC>\n" + "<DOCNO>wikiquote-trec-" + curDocNumber
                + "</DOCNO>\n";
    }

    /**
     * @return the closing {@code </DOC>} line followed by a blank line
     */
    private static String getFooter() {
        return "</DOC>\n\n";
    }

    /**
     * Clears all state so the instance can be reused: empty title and text,
     * trash flag off, document number 0.
     */
    public void reset() {
        this.title = "";
        this.text = "";
        this.trash = false;
        this.docNumber = 0;
    }
}