// WikipediaMarkupCleaner.java example
//
// Explorer
// marytts-master
/**
 * Copyright 2007 DFKI GmbH.
 * All Rights Reserved.  Use is subject to license terms.
 *
 * This file is part of MARY TTS.
 *
 * MARY TTS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, version 3 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */
package marytts.tools.dbselection;

import java.io.File;
import java.io.FileWriter;
import java.io.PrintWriter;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Scanner;
import java.util.Vector;

import org.apache.commons.lang.StringEscapeUtils;

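/**
 * Cleans wiki markup out of Wikipedia page text (see {@link #removeMarkup(String)}) and holds the locale, MySQL and
 * file settings that are configured through the setters of this class.
 * 
 * @author Marcela Charfuelan.
 */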
public class WikipediaMarkupCleaner {

	// Locale, kept as a plain string (see setLocale/getLocale).
	private String locale = null;
	// MySQL database settings: host, database name, user and password.
	private String mysqlHost = null;
	private String mysqlDB = null;
	private String mysqlUser = null;
	private String mysqlPasswd = null;
	// Wikipedia files: presumably the XML dump to read and the log file to write - TODO confirm against the callers.
	private String xmlWikiFile = null;
	private String wikiLog = null;
	// When true, the markup-removal helpers print trace and warning messages to System.out.
	private boolean debug = false;
	// Page id used for debugging; set through setTestId().
	private String debugPageId = null;
	// Default settings for the minimum page length and the minimum and maximum text length.
	private int minPageLength = 10000; // minimum size of a wikipedia page, to be used in the first filtering of pages
	private int minTextLength = 1000;
	// removeMarkup() starts a new text chunk once the accumulated clean text is longer than this.
	private int maxTextLength = 15000; // the average length in one big xml file is approx. 12000

	// Set this to false to save time by not loading the Wiki tables when they already exist in the DB.
	private boolean loadWikiTables = true;

	// Set this to false to keep adding to an already existing cleanText table instead of creating a new one.
	private boolean deleteCleanTextTable = true;

	/**
	 * Stores the locale string that {@link #getLocale()} returns.
	 *
	 * @param str
	 *            the locale value to keep
	 */
	public void setLocale(String str) {
		this.locale = str;
	}

	/**
	 * Stores the MySQL host setting.
	 *
	 * @param str
	 *            the host value to keep
	 */
	public void setMysqlHost(String str) {
		this.mysqlHost = str;
	}

	/**
	 * Stores the MySQL database setting.
	 *
	 * @param str
	 *            the database value to keep
	 */
	public void setMysqlDB(String str) {
		this.mysqlDB = str;
	}

	/**
	 * Stores the MySQL user setting.
	 *
	 * @param str
	 *            the user value to keep
	 */
	public void setMysqlUser(String str) {
		this.mysqlUser = str;
	}

	/**
	 * Stores the MySQL password setting.
	 *
	 * @param str
	 *            the password value to keep
	 */
	public void setMysqlPasswd(String str) {
		this.mysqlPasswd = str;
	}

	/**
	 * Stores the XML wiki file setting.
	 *
	 * @param str
	 *            the value to keep
	 */
	public void setXmlWikiFile(String str) {
		this.xmlWikiFile = str;
	}

	/**
	 * Stores the wiki log setting.
	 *
	 * @param str
	 *            the value to keep
	 */
	public void setWikiLog(String str) {
		this.wikiLog = str;
	}

	/**
	 * Stores the page id used for debugging (kept in the {@code debugPageId} field).
	 *
	 * @param str
	 *            the page id to keep
	 */
	public void setTestId(String str) {
		this.debugPageId = str;
	}

	/**
	 * Overrides the default minimum page length.
	 *
	 * @param val
	 *            the new minimum page length
	 */
	public void setMinPageLength(int val) {
		this.minPageLength = val;
	}

	/**
	 * Overrides the default minimum text length.
	 *
	 * @param val
	 *            the new minimum text length
	 */
	public void setMinTextLength(int val) {
		this.minTextLength = val;
	}

	/**
	 * Overrides the default maximum text length, i.e. the size above which {@code removeMarkup} starts a new chunk.
	 *
	 * @param val
	 *            the new maximum text length
	 */
	public void setMaxTextLength(int val) {
		this.maxTextLength = val;
	}

	/**
	 * Switches the debug printouts of the markup-removal methods on or off.
	 *
	 * @param bval
	 *            {@code true} to print debug messages
	 */
	public void setDebug(boolean bval) {
		this.debug = bval;
	}

	/**
	 * Stores the flag that says whether the Wiki tables should be loaded.
	 *
	 * @param bval
	 *            the new flag value
	 */
	public void setLoadWikiTables(boolean bval) {
		this.loadWikiTables = bval;
	}

	/**
	 * Stores the flag that says whether an existing cleanText table should be replaced rather than extended.
	 *
	 * @param bval
	 *            the new flag value
	 */
	public void setDeleteCleanTextTable(boolean bval) {
		this.deleteCleanTextTable = bval;
	}

	/**
	 * @return the stored locale string, {@code null} if none was set
	 */
	public String getLocale() {
		return this.locale;
	}

	/**
	 * @return the stored MySQL host setting, {@code null} if none was set
	 */
	public String getMysqlHost() {
		return this.mysqlHost;
	}

	/**
	 * @return the stored MySQL database setting, {@code null} if none was set
	 */
	public String getMysqlDB() {
		return this.mysqlDB;
	}

	/**
	 * @return the stored MySQL user setting, {@code null} if none was set
	 */
	public String getMysqlUser() {
		return this.mysqlUser;
	}

	/**
	 * @return the stored MySQL password setting, {@code null} if none was set
	 */
	public String getMysqlPasswd() {
		return this.mysqlPasswd;
	}

	/**
	 * @return the stored XML wiki file setting, {@code null} if none was set
	 */
	public String getXmlWikiFile() {
		return this.xmlWikiFile;
	}

	/**
	 * @return the stored wiki log setting, {@code null} if none was set
	 */
	public String getWikiLog() {
		return this.wikiLog;
	}

	/**
	 * @return the page id used for debugging (the {@code debugPageId} field), {@code null} if none was set
	 */
	public String getTestId() {
		return this.debugPageId;
	}

	/**
	 * @return the current minimum page length
	 */
	public int getMinPageLength() {
		return this.minPageLength;
	}

	/**
	 * @return the current minimum text length
	 */
	public int getMinTextLength() {
		return this.minTextLength;
	}

	/**
	 * @return the current maximum text length
	 */
	public int getMaxTextLength() {
		return this.maxTextLength;
	}

	/**
	 * @return {@code true} if debug printouts are switched on
	 */
	public boolean getDebug() {
		return this.debug;
	}

	/**
	 * @return the flag that says whether the Wiki tables should be loaded
	 */
	public boolean getLoadWikiTables() {
		return this.loadWikiTables;
	}

	/**
	 * @return the flag that says whether an existing cleanText table should be replaced rather than extended
	 */
	public boolean getDeleteCleanTextTable() {
		return this.deleteCleanTextTable;
	}

	/** Headings of trailing non-prose sections; processing of a page stops at the first line containing one of them. */
	private static final String[] END_OF_TEXT_MARKERS = { "==References", "== References", "==See also", "== See also",
			"==External links and sources", "==External links", "== External links", "== External Links",
			"== External links and sources", "==Notes", "== Notes", "==Sources", "== Sources", "==Foreign", "== Foreign",
			"==Discussion" };

	/**
	 * Opening and closing markers of the (possibly multi-line) constructs that are removed completely, in the order in
	 * which they are tried. The wiki table marker must come before the generic template marker because a table can
	 * contain templates.
	 */
	private static final String[][] SECTION_TAGS = { { "<noinclude", "</noinclude>" },
			{ "<includeonly", "</includeonly>" }, { "<onlyinclude", "</onlyinclude>" }, { "<table", "</table>" },
			{ "<TABLE", "</TABLE>" }, { "{{col-begin}}", "{{col-end}}" }, { "{|", "|}" }, { "<ref", "</ref>" },
			{ "<REF", "</REF>" }, { "<Ref", "</Ref>" }, { "<reF", "</reF>" }, { "{{start box}}", "{{end box}}" },
			{ "{{", "}}" }, { "<!--", "-->" }, { "\\mathrel{|", "}" }, { "<gallery", "</gallery>" },
			{ "[[Image:", "]]" }, { "<div", "</div>" }, { "<DIV", "</DIV>" }, { "<span", "</span>" },
			{ "<math>", "</math>" }, { "<timeline>", "</timeline>" }, { "<nowiki", "</nowiki>" },
			{ "<source", "</source>" }, { "<code", "</code>" }, { "<imagemap", "</imagemap>" }, { "<poem", "</poem>" },
			{ "<h1", "</h1>" }, { "<pre", "</pre>" } };

	/** A line shorter than 200 characters that starts with one of these is treated as list/table/interwiki noise. */
	private static final String[] NOISE_PREFIXES = { "*", "#", ";", ".", ",", "&", "}", "]", "|", "ca:", "cs:", "de:",
			"es:", "fr:", "it:", "hu:", "ja:", "no:", "pt:", "sl:", "fi:", "sv:", "tr:", "zh:", "Category:", "!style=",
			"!  style=", "!align=", "::<code" };

	// Inline tags whose markers are deleted while the enclosed text is kept; removed one after the other in this order.
	// The bare "<br" prefix together with the final "/>" also takes care of "<br/>" and "<br />".
	private static final String[] INLINE_HTML_TAGS = { "<big>", "</big>", "<blockquote>", "</blockquote>",
			"<BLOCKQUOTE>", "</BLOCKQUOTE>", "<sup>", "</sup>", "<sub>", "</sub>", "<small>", "</small>", "<ul>",
			"</ul>", "<UL>", "</UL>", "<br>", "<br", "<BR>", "<Center>", "<center>", "</center>", "<CENTER>",
			"</CENTER>", "<cite>", "</cite>", "<li>", "</li>", "<LI>", "</LI>", "<dl>", "</dl>", "<dt>", "</dt>",
			"<dd>", "</dd>", "<b>", "</b>", "<p>", "</p>", "<u>", "</u>", "<tt>", "</tt>", "<i>", "</i>", "<I>",
			"</I>", "<s>", "</s>", "<em>", "</em>", "</br>", "</div>", "</ref>", "/>" };

	/**
	 * Strips the wiki markup from one Wikipedia page and returns the remaining plain text.
	 * <p>
	 * The page is read line by line until a trailing section such as References or External links starts. On every line
	 * block constructs (templates, tables, references, comments, images, ...) are removed, short list/table/interwiki
	 * lines are dropped, and bold/italic quotes, links, HTML entities, simple HTML tags, typographic quotes and
	 * heading/list markers are cleaned. The cleaned lines are concatenated with a newline after each one; whenever the
	 * accumulated text becomes longer than the maximum text length it is stored as one chunk and a new chunk is started.
	 *
	 * @param page
	 *            the raw wiki text of one page
	 * @return the clean text chunks in page order; empty if no text is left
	 */
	public Vector<String> removeMarkup(String page) {
		Vector<String> textList = new Vector<String>();
		StringBuffer str = new StringBuffer();
		Scanner s = new Scanner(page);
		try {
			while (s.hasNext()) {
				StringBuffer line = new StringBuffer(s.nextLine());
				if (containsEndOfTextMarker(line)) {
					break; // everything after these headings is reference material, not running text
				}
				line = stripBlockMarkup(s, line);
				if (line.length() == 0 || isShortNoiseLine(line.toString())) {
					continue;
				}
				line = stripLeftoverTags(s, cleanInlineMarkup(line));

				// concatenate the line, making sure that every line ends with a newline
				str.append(line);
				if (str.length() == 0 || str.charAt(str.length() - 1) != '\n') {
					str.append("\n");
				}
				// start a new chunk once the text is long enough
				if (str.length() > maxTextLength) {
					textList.add(str.toString());
					str = new StringBuffer();
				}
			}
		} finally {
			s.close();
		}

		if (str.length() > 0) {
			textList.add(str.toString());
		}
		return textList;
	}

	/** Tells whether the line contains one of the headings that end the usable text of a page. */
	private static boolean containsEndOfTextMarker(StringBuffer line) {
		for (String marker : END_OF_TEXT_MARKERS) {
			if (line.indexOf(marker) >= 0) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Removes all block constructs listed in SECTION_TAGS from the line, reading further lines from the scanner when a
	 * construct is not closed on the current one. Removing a section can pull in lines that contain more markers, so the
	 * whole list is retried until a pass finds nothing or the line is empty.
	 */
	private StringBuffer stripBlockMarkup(Scanner s, StringBuffer lineIn) {
		StringBuffer line = lineIn;
		boolean clean = false;
		while (!clean && line.length() > 0) {
			clean = true;
			for (String[] tags : SECTION_TAGS) {
				String iniTag = tags[0];
				if (line.indexOf(iniTag) < 0) {
					continue;
				}
				if (iniTag.equals("{|")) {
					line = removeSectionTable(s, line, iniTag, tags[1]);
				} else if (iniTag.equals("<ref")) {
					// special because a reference can end with </ref> or with />
					line = removeSectionRef(s, line);
				} else if (iniTag.equals("[[Image:")) {
					// special because an image can contain nested [[ ]] links
					line = removeSectionImage(s, line, iniTag, tags[1]);
				} else {
					// this includes <DIV, which used to be sent to the image remover and was then cut at the next ]]
					line = removeSection(s, line, iniTag, tags[1]);
				}
				clean = false;
			}
		}
		return line;
	}

	/** Tells whether the text is a short (under 200 characters) bullet, table, interwiki or link-only line. */
	private static boolean isShortNoiseLine(String text) {
		if (text.length() >= 200) {
			return false;
		}
		return text.endsWith("]]") || startsWithAnyPrefix(text, NOISE_PREFIXES);
	}

	/** Tells whether the text starts with at least one of the given prefixes. */
	private static boolean startsWithAnyPrefix(String text, String... prefixes) {
		for (String prefix : prefixes) {
			if (text.startsWith(prefix)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Cleans the inline markup of one line: bold/italic quotes, internal and external links, HTML entities, simple HTML
	 * tags, quotation marks and other problematic symbols, heading markers and leading list markers.
	 */
	private StringBuffer cleanInlineMarkup(StringBuffer lineIn) {
		// '''''bold & italic''''', '''bold''', ''italic''
		String text = lineIn.toString().replace("'''''", "").replace("'''", "").replace("''", "");

		text = processInternalAndExternalLinks(new StringBuffer(text)).toString();

		// converts HTML entities into the characters they stand for
		text = StringEscapeUtils.unescapeHtml(text);

		// the previous step does not remove HTML tags, so the common ones are removed here
		for (String tag : INLINE_HTML_TAGS) {
			text = text.replace(tag, "");
		}

		// quotation marks: plain, left double and right double
		text = text.replace("\"", "").replace("\u201C", "").replace("\u201D", "");
		// right single quote becomes an apostrophe, em and en dash become a hyphen
		text = text.replace('\u2019', '\'').replace('\u2014', '-').replace('\u2013', '-');
		// non-breaking space (what unescaping the nbsp entity produces) and ellipsis become a plain space
		text = text.replace('\u00A0', ' ').replace('\u2026', ' ');

		// section titles: drop the == markers and end the title with a period
		boolean isTitle = text.startsWith("==");
		text = text.replaceAll("\\s*==+$|==+", "");
		if (isTitle) {
			text = text + ".";
		}

		// bulleted and numbered lists; the checks are cumulative on purpose
		if (startsWithAnyPrefix(text, "***", "*#*")) {
			text = text.substring(3);
		}
		if (startsWithAnyPrefix(text, "**", ":*", "*#", "##", "::")) {
			text = text.substring(2);
		}
		if (startsWithAnyPrefix(text, "*", "#")) {
			text = text.substring(1);
		}
		if (text.startsWith(";")) { // in glossaries definitions start with ;
			text = text.substring(1);
		}
		return new StringBuffer(text);
	}

	/** Removes the tags that are only dealt with once the text is almost clean: font, blockquote, ol and &lt;http: */
	private StringBuffer stripLeftoverTags(Scanner s, StringBuffer lineIn) {
		StringBuffer line = lineIn;
		if (line.indexOf("<font") >= 0) {
			line = removeSection(s, line, "<font", ">");
		}
		line = new StringBuffer(line.toString().replace("</font>", ""));

		if (line.indexOf("<blockquote") >= 0) {
			line = removeSection(s, line, "<blockquote", ">");
		}
		if (line.indexOf("<ol") >= 0) {
			line = removeSection(s, line, "<ol", ">");
		}
		if (line.indexOf("<http:") >= 0) {
			line = removeSection(s, line, "<http:", ">");
		}
		return line;
	}

	/**
	 * Removes every reference that starts with {@code <ref} from the line. A reference has one of these forms:
	 *
	 * <pre>
	 * &lt;ref&gt; ... &lt;/ref&gt;
	 * &lt;ref name=... &gt; ... &lt;/ref&gt;
	 * &lt;ref name=... /&gt;
	 * </pre>
	 *
	 * The form is decided by looking at how the opening tag itself ends, so a self-closing reference is removed on its
	 * own and the text between it and a later reference on the same line is kept. When a reference is not closed on the
	 * current line, further lines are read from the scanner and appended until it is.
	 *
	 * @param s
	 *            scanner over the rest of the page, used to fetch the continuation of an unclosed reference
	 * @param lineIn
	 *            the line to clean; it is not modified
	 * @return the line without references, or an empty buffer if a reference is never closed
	 */
	private StringBuffer removeSectionRef(Scanner s, StringBuffer lineIn) {
		StringBuffer line = new StringBuffer(lineIn);
		int start;

		while ((start = line.indexOf("<ref")) >= 0) { // in one line can be more than one reference
			int end = findRefEnd(line, start);
			// the end of the reference must be in the next lines, so get more lines until it is found
			while (end < 0 && s.hasNext()) {
				line.append(s.nextLine());
				end = findRefEnd(line, start);
			}
			if (end < 0) {
				if (debug)
					System.out.println("removeSectionRef: WARNING no </ref> or /> in " + line);
				return new StringBuffer("");
			}
			line.delete(start, end);
		}
		return line;
	}

	/**
	 * Finds the position just after the end of the reference whose opening tag starts at {@code start}.
	 *
	 * @return the exclusive end index of the reference, or -1 if the reference is not complete in {@code line} yet
	 */
	private int findRefEnd(StringBuffer line, int start) {
		int tagClose = line.indexOf(">", start);
		if (tagClose < 0) {
			return -1; // even the opening tag is not complete yet
		}
		if (line.charAt(tagClose - 1) == '/') {
			return tagClose + 1; // self-closing reference
		}
		int endTag = line.indexOf("</ref>", tagClose);
		return endTag < 0 ? -1 : endTag + "</ref>".length();
	}

	/**
	 * Removes every section that starts with {@code iniTag} and ends with {@code endTag} from the line.
	 * <p>
	 * If the end tag is on the same line, the text from the start tag to the first end tag after it is deleted.
	 * Otherwise further lines are read from the scanner and appended (without separator) while the number of start and
	 * end tags seen is not balanced, and the text up to the balancing end tag is deleted. End tags are only looked for
	 * after the start tag, and counting stops as soon as the section is balanced, so stray end tags neither close the
	 * section too early nor make the method read on to the end of the page.
	 *
	 * @param s
	 *            scanner over the rest of the page, used to fetch the continuation of an unclosed section
	 * @param lineIn
	 *            the line to clean; it is not modified
	 * @param iniTag
	 *            text that opens a section
	 * @param endTag
	 *            text that closes a section
	 * @return the line without the sections, or an empty buffer if a section is never closed
	 */
	private StringBuffer removeSection(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
		int index1, index2, endTagLength = 0, numRef = 0;
		StringBuffer line = new StringBuffer(lineIn);

		if (debug)
			System.out.println("Removing tag: " + iniTag + "  LINE (BEFORE): " + line);

		while ((index1 = line.indexOf(iniTag)) >= 0) { // in one line can be more than one iniTag

			numRef++;
			index2 = line.indexOf(endTag, index1);
			if (index2 >= 0) {
				// the endTag was found on the same line
				endTagLength = endTag.length() + index2;
				numRef--;
			} else {
				// the endTag must be in the next lines, so get more lines until the section is balanced
				int lastEndTag = index1; // only end tags after the start tag can close the section

				while (numRef > 0 && s.hasNext()) {
					String nextLine = s.nextLine();

					// every further start tag opens a nested section
					int lastIniTag = 0;
					int nested;
					while ((nested = nextLine.indexOf(iniTag, lastIniTag)) >= 0) {
						numRef++;
						lastIniTag = iniTag.length() + nested;
					}

					line.append(nextLine);

					// count end tags after the last one found, but stop as soon as the section is balanced
					while (numRef > 0 && (index2 = line.indexOf(endTag, lastEndTag)) >= 0) {
						numRef--;
						lastEndTag = index2 + endTag.length();
						endTagLength = lastEndTag;
					}
				}
			}

			if (numRef == 0) {
				index1 = line.indexOf(iniTag); // get again this because the position might change
				if (endTagLength > index1) {
					if (debug) {
						System.out.println("    FINAL LINE: " + line);
						System.out.print("iniTag: " + iniTag + "  index1=" + index1);
						System.out.print("  endTagLength=" + endTagLength);
						System.out.println("  line.length=" + line.length() + "  line: " + line);
						System.out.println("  line.length=" + line.length());
					}
					line.delete(index1, endTagLength);
				} else {
					if (debug) {
						System.out.println("removeSection: WARNING endTagLength > length of line: ");
						System.out.print("iniTag: " + iniTag + "  index1=" + index1);
						System.out.print("  endTagLength=" + endTagLength);
						System.out.println("  line.length=" + line.length() + "  line: " + line);
						System.out.println("removeSection: WARNING endTagLength > length of line: " + line);
					}
					line = new StringBuffer("");
				}
			} else {
				// the input ended before the section was closed
				if (debug)
					System.out.println("removeSection: WARNING no " + endTag);
				line = new StringBuffer("");
			}

		} // while this line contains iniTag-s

		if (debug)
			System.out.println("    LINE (AFTER): " + line);
		return line;
	}

	/**
	 * Removes every table section from the line. A table that is closed on the same line ends at the first
	 * {@code endTag} after its {@code iniTag}. Otherwise further lines are read from the scanner and appended (without
	 * separator); every {@code iniTag} in such a line opens a nested table and a line that starts with {@code endTag}
	 * closes one.
	 *
	 * @param s
	 *            scanner over the rest of the page
	 * @param lineIn
	 *            the line to clean; it is not modified
	 * @param iniTag
	 *            text that opens a table
	 * @param endTag
	 *            text that closes a table
	 * @return the line without tables, or an empty buffer if a table is never closed
	 */
	private StringBuffer removeSectionTable(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
		StringBuffer text = new StringBuffer(lineIn);
		int cutEnd = 0; // exclusive end of the text to delete
		int open = 0; // number of tables opened and not closed yet

		if (debug)
			System.out.println("Removing tag: " + iniTag + "  LINE (BEFORE): " + text);

		int begin = text.indexOf(iniTag);
		while (begin >= 0) { // a line can hold more than one table
			open++;
			int close = text.indexOf(endTag, begin);

			if (close >= 0) {
				// closed on the same line
				cutEnd = close + endTag.length();
				open--;
			} else {
				// keep reading lines until all opened tables are closed or the input ends
				while (s.hasNext() && open != 0) {
					String extra = s.nextLine();

					for (int at = extra.indexOf(iniTag); at >= 0; at = extra.indexOf(iniTag, at + iniTag.length())) {
						open++;
					}

					if (extra.startsWith(endTag)) {
						open--;
						// the line is appended right after this, so its end tag finishes at this position
						cutEnd = text.length() + endTag.length();
					}

					text.append(extra);
				}
			}

			if (open != 0) {
				if (debug)
					System.out.println("removeSection: WARNING no " + endTag);
				text = new StringBuffer("");
			} else {
				begin = text.indexOf(iniTag); // look it up again because the position might have changed
				if (cutEnd > begin) {
					if (debug) {
						System.out.println("    FINAL LINE: " + text);
						System.out.print("iniTag: " + iniTag + "  index1=" + begin);
						System.out.print("  endTagLength=" + cutEnd);
						System.out.println("  line.length=" + text.length() + "  line: " + text);
						System.out.println("  line.length=" + text.length());
					}
					text.delete(begin, cutEnd);
				} else {
					if (debug) {
						System.out.println("removeSection: WARNING endTagLength > length of line: ");
						System.out.print("iniTag: " + iniTag + "  index1=" + begin);
						System.out.print("  endTagLength=" + cutEnd);
						System.out.println("  line.length=" + text.length() + "  line: " + text);
						System.out.println("removeSection: WARNING endTagLength > length of line: " + text);
					}
					text = new StringBuffer("");
				}
			}

			begin = text.indexOf(iniTag);
		}

		if (debug)
			System.out.println("    LINE (AFTER): " + text);
		return text;
	}

	/**
	 * Removes every image section from the line. This is special because an image can contain nested sections with
	 * [[ ... ]], so the first ]] after the start tag is not necessarily the end of the image: a ]] only closes the image
	 * if there is no [[ between it and the previously examined position.
	 * <p>
	 * The text already in the line is always examined before more input is requested, and a line appended from the
	 * scanner is examined even if it was the last one, so an image that closes on the last line of the page is removed
	 * without discarding the text around it.
	 * 
	 * @param s
	 *            scanner over the rest of the page, used to fetch the continuation of an unclosed image
	 * @param lineIn
	 *            the line to clean; it is not modified
	 * @param iniTag
	 *            text that opens the image section
	 * @param endTag
	 *            closing tag; only its length is used, the text searched for is always ]]
	 * @return the line without images, or an empty buffer if an image is never closed
	 */
	private StringBuffer removeSectionImage(Scanner s, StringBuffer lineIn, String iniTag, String endTag) {
		int index1, index2, endTagLength = 0, numRef = 0;
		StringBuffer line = new StringBuffer(lineIn);

		if (debug)
			System.out.println("Removing tag: " + iniTag + "  LINE (BEFORE): " + line);

		while ((index1 = line.indexOf(iniTag)) >= 0) { // in one line can be more than one iniTag

			numRef++;
			endTagLength = index1;

			while (numRef > 0) {

				while (numRef > 0 && (index2 = line.indexOf("]]", endTagLength)) >= 0) {
					String aux = line.substring(index1 + 2, index2 + 2);
					if (debug)
						System.out.println("    aux=" + aux);
					if (aux.indexOf("[[") == -1) {
						endTagLength = endTag.length() + index2;
						numRef--;
					} else { // The previous was a [[ ]] inside of a [[Image: so it has to be skipped
						index1 = index2;
						endTagLength = index2 + 2;
					}
				}
				// so far it has not found the endTag, so get another line if there is one
				if (numRef > 0) {
					if (!s.hasNext())
						break;
					line.append(s.nextLine());
				}
			}

			if (numRef == 0) {
				index1 = line.indexOf(iniTag); // get again this because the position might change
				if (endTagLength > index1) {
					if (debug) {
						System.out.println("    FINAL LINE: " + line);
						System.out.print("iniTag: " + iniTag + "  index1=" + index1);
						System.out.print("  endTagLength=" + endTagLength);
						System.out.println("  line.length=" + line.length() + "  line: " + line);
						System.out.println("  line.length=" + line.length());
					}
					line.delete(index1, endTagLength);
				} else {
					if (debug) {
						System.out.println("removeSection: WARNING endTagLength > length of line: ");
						System.out.print("iniTag: " + iniTag + "  index1=" + index1);
						System.out.print("  endTagLength=" + endTagLength);
						System.out.println("  line.length=" + line.length() + "  line: " + line);
						System.out.println("removeSection: WARNING endTagLength > length of line: " + line);
					}
					line = new StringBuffer("");
				}

			} else {
				// the input ended before the image was closed
				if (debug)
					System.out.println("removeSection: WARNING no " + endTag);
				line = new StringBuffer("");
			}

		} // while this line contains iniTag-s

		if (debug)
			System.out.println("    LINE (AFTER): " + line);
		return line;
	}

	/***
	 * Replaces wiki links by their visible text. The buffer passed in is modified and returned.
	 * <ul>
	 * <li>Internal links: [[Name of page]] becomes "Name of page"; [[Name of page|Text to display]] becomes what
	 * follows the first | of the link.</li>
	 * <li>External links in brackets: [http://www.example.org Text to display] becomes "Text to display";
	 * [http://www.example.org] is deleted. Plain URLs without brackets are left untouched.</li>
	 * </ul>
	 * The closing bracket of a link is always searched after its opening bracket, so a stray closing bracket earlier in
	 * the line is not mistaken for the end of the link. An opening bracket without any closing bracket after it is
	 * simply deleted.
	 * 
	 * @param line
	 *            line
	 * @return the same buffer, with the links replaced
	 */
	private StringBuffer processInternalAndExternalLinks(StringBuffer line) {
		int index1, index2, index3;
		StringBuffer linetmp = null; // for debugging
		boolean changed = false;
		if (debug)
			linetmp = new StringBuffer(line);

		// Internal links:
		while ((index1 = line.indexOf("[[")) >= 0) {
			changed = true;
			if ((index2 = line.indexOf("]]", index1)) >= 0) {
				if ((index3 = line.indexOf("|", index1)) >= 0 && index3 < index2) { // if there is text to display
					line.delete(index1, index3 + 1); // delete the [[ and the link target up to the |
				} else {
					line.delete(index1, index1 + 2); // delete the [[
				}
				// the text moved, so find the ]] again; it is still at or after index1
				index2 = line.indexOf("]]", index1);
				line.delete(index2, index2 + 2);
			} else {
				if (debug) {
					System.out.println("processInternalAndExternalLinks: WARNING no ]] tag in " + line);
					System.out.println("deleting [[");
				}
				line.delete(index1, index1 + 2); // delete the [[
			}
		}

		// External links: just the ones that start with [http: or [https:
		while ((index1 = line.indexOf("[http:")) >= 0 || (index1 = line.indexOf("[https:")) >= 0) {
			changed = true;
			if ((index2 = line.indexOf("]", index1)) >= 0) {
				if ((index3 = line.indexOf(" ", index1)) >= 0 && index3 < index2) { // if there is text to display
					line.delete(index1, index3 + 1); // delete the [ and the URL up to the first blank space
					index2 = line.indexOf("]", index1); // the text moved, so find the ] again
					line.delete(index2, index2 + 1);
				} else {
					line.delete(index1, index2 + 1); // no text to display, delete the whole link
				}
			} else {
				if (debug) {
					System.out.println("processInternalAndExternalLinks: WARNING no ] tag when processing lines with http: line="
							+ line);
					System.out.println("deleting [");
				}
				line.delete(index1, index1 + 1); // delete the [
			}
		}

		if (debug && changed) {
			System.out.println("Removing links, LINE(BEFORE): " + linetmp);
			System.out.println("                LINE (AFTER): " + line);
		}

		return line;

	}

	/**
	 * Counts the words found in {@code text} and accumulates the counts in {@code wordList}.
	 * <p>
	 * The text is split into lines on {@code "\n"} and each line into tokens on single spaces. Inside a token, every maximal
	 * run of code points accepted by {@code StringUtils.isLetterOrModifier} is treated as one word; anything else (digits,
	 * punctuation, ...) acts as a separator. Runs shorter than two chars are ignored. Words are counted exactly as written,
	 * no case folding is applied.
	 * 
	 * @param text
	 *            the text whose words are counted
	 * @param wordList
	 *            map from word to number of occurrences; updated in place
	 */
	public void addWordToHashMap(String text, HashMap<String, Integer> wordList) {
		// shortest letter run (measured in chars) that is still counted as a word
		final int minimumLength = 2;

		for (String sentence : text.split("\n")) {
			for (String w : sentence.split(" ")) {
				// start is the index where the current letter run began, or -1 while we are not inside a run
				int start = 0;
				int end = 0;
				while (end < w.length()) {
					int codePoint = w.codePointAt(end);
					if (marytts.util.string.StringUtils.isLetterOrModifier(codePoint)) {
						if (start < 0)
							start = end;
					} else {
						// not a letter: close the run that ends here, if it is long enough
						if (start >= 0 && end - start >= minimumLength)
							incrementWordCount(wordList, w.substring(start, end));
						start = -1;
					}
					// step over both halves of a surrogate pair, so that the low surrogate of a supplementary letter is
					// never examined on its own (which would wrongly split the word in the middle of a character)
					end += Character.charCount(codePoint);
				}
				// the token may end while still inside a letter run
				if (start >= 0 && end - start >= minimumLength)
					incrementWordCount(wordList, w.substring(start, end));
			}
		}
	}

	/**
	 * Adds one occurrence of {@code word} to {@code wordList}, inserting the word with count 1 if it is not present yet.
	 */
	private static void incrementWordCount(HashMap<String, Integer> wordList, String word) {
		Integer count = wordList.get(word);
		wordList.put(word, Integer.valueOf(count == null ? 1 : count.intValue() + 1));
	}

	/**
	 * Stores {@code wlNew} in the {@code <locale>_wordList} DB table.
	 * <p>
	 * If the table does not exist yet, {@code wlNew} is inserted as is. Otherwise the word list currently returned by
	 * {@code wikiToDB.getMostFrequentWords(0, 0)} is merged with {@code wlNew} (frequencies of words present in both are
	 * added up) and the merged list is passed to {@code wikiToDB.insertWordList}.
	 * 
	 * @param wikiToDB
	 *            DB handler used to read and write the word list
	 * @param wlNew
	 *            newly collected words with their frequencies; not modified
	 */
	public void updateWordList(DBHandler wikiToDB, HashMap<String, Integer> wlNew) {
		// No word list in the DB yet: just save the new one
		if (!wikiToDB.tableExist(locale + "_wordList")) {
			System.out.println("Saving " + locale + "_wordList table....");
			wikiToDB.insertWordList(wlNew);
			return;
		}

		System.out.println("Updating " + locale + "_wordList in DB table....");
		HashMap<String, Integer> wlOld = wikiToDB.getMostFrequentWords(0, 0);

		// combine the two lists: new words are added, known words get their frequency increased
		for (String word : wlNew.keySet()) {
			int freq = wlNew.get(word).intValue();
			Integer oldFreq = wlOld.get(word);
			wlOld.put(word, Integer.valueOf(oldFreq == null ? freq : oldFreq.intValue() + freq));
		}

		wikiToDB.insertWordList(wlOld);
		System.out.println("Final size of wordList after combining old and new lists: wordList=[" + wlOld.size() + "]");
	}

	/**
	 * Debug helper: fetches the single page {@code debugPageId} from the DB, prints its raw text and then the text chunks
	 * produced by {@code removeMarkup}. Nothing is written back to the DB.
	 * <p>
	 * If {@code wikiLog} is set, a log writer on that file is handed to {@code getTextFromWikiPage}.
	 * 
	 * @throws Exception
	 *             if the DB connection, the log file or the page retrieval fails
	 */
	void processWikipediaSQLTablesDebug() throws Exception {

		DBHandler wikiToDB = new DBHandler(locale);
		wikiToDB.createDBConnection(mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);

		PrintWriter pw = null;
		try {
			if (wikiLog != null)
				pw = new PrintWriter(new FileWriter(new File(wikiLog)));

			// get text from the DB
			StringBuilder textId = new StringBuilder();
			String text = wikiToDB.getTextFromWikiPage(debugPageId, minPageLength, textId, pw);

			// text may be null (presumably when the page is rejected by the minPageLength filter - confirm in DBHandler),
			// so it must be checked before anything is read from it
			if (text != null) {
				System.out.println("\nPAGE SIZE=" + text.length() + "  text:\n" + text);

				Vector<String> textList = removeMarkup(text);
				System.out.println("\nCLEANED TEXT:");
				for (int i = 0; i < textList.size(); i++)
					System.out.println("text(" + i + "): \n" + textList.get(i));
			} else {
				System.out.println("NO CLEANED TEXT.");
			}
		} finally {
			// release the log file and the DB connection even when something above failed
			if (pw != null)
				pw.close();
			wikiToDB.closeDBConnection();
		}

	}

	/**
	 * Using mwdumper extracts pages from a xmlWikiFile and loads them in a mysql DB (it loads the tables "locale_text",
	 * "locale_page" and "locale_revision", where locale is the corresponding wikipedia language). Once the tables are loaded,
	 * extracts/cleans text from the pages and fills the cleanText table. It also creates a wordList table including
	 * frequencies.
	 * <p>
	 * In detail:
	 * <ol>
	 * <li>load the wikipedia tables, or, if {@code loadWikiTables} is false, verify that they already exist;</li>
	 * <li>create the cleanText table, or reuse the existing one if {@code deleteCleanTextTable} is false;</li>
	 * <li>for every page id: get the page text, remove the markup and store every resulting chunk longer than
	 * {@code minTextLength} in the cleanText table, adding its words to the in-memory word list;</li>
	 * <li>save the word list in the DB, print it to {@code ./wordlist-freq.txt} and delete the wikipedia text, page and
	 * revision tables.</li>
	 * </ol>
	 * The log file (if {@code wikiLog} is set) and the DB connection are released even when processing fails.
	 * 
	 * @throws Exception
	 *             if the wikipedia tables are expected to exist but are not found, or if any DB or file operation fails
	 */
	void processWikipediaPages() throws Exception {
		// Load wikipedia pages, extract clean text and create word list.
		DateFormat fullDate = new SimpleDateFormat("dd_MM_yyyy_HH:mm:ss");
		String dateStringIni = fullDate.format(new Date());
		String dateStringEnd = "";

		DBHandler wikiToDB = new DBHandler(locale);

		System.out.println("Creating connection to DB server...");
		wikiToDB.createDBConnection(mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);

		try {
			// This loading can take a while
			// create and load TABLES: page, text and revision
			if (loadWikiTables) {
				System.out.println("Creating and loading TABLES: page, text and revision. (The loading can take a while...)");
				wikiToDB.loadPagesWithMWDumper(xmlWikiFile, locale, mysqlHost, mysqlDB, mysqlUser, mysqlPasswd);
			} else {
				// Checking if tables are already created and loaded in the DB
				if (wikiToDB.checkWikipediaTables())
					System.out.println("TABLES " + locale + "_page, " + locale + "_text and " + locale
							+ "_revision already loaded (WARNING USING EXISTING WIKIPEDIA TABLES).");
				else
					throw new Exception("WikipediaMarkupCleaner: ERROR IN TABLES " + locale + "_page, " + locale
							+ "_text and " + locale + "_revision, they are not CREATED/LOADED.");
			}

			System.out.println("\nGetting page IDs");
			String pageId[] = wikiToDB.getIds("page_id", locale + "_page");
			System.out.println("Number of page IDs to process: " + pageId.length + "\n");

			// create cleanText TABLE
			if (deleteCleanTextTable) {
				System.out.println("Creating (deleting if already exist) " + locale + "_cleanText TABLE");
				wikiToDB.createWikipediaCleanTextTable();
			} else {
				if (wikiToDB.tableExist(locale + "_cleanText"))
					System.out.println(locale + "_cleanText TABLE already exist (ADDING TO EXISTING cleanText TABLE)");
				else {
					System.out.println("Creating " + locale + "_cleanText TABLE");
					wikiToDB.createWikipediaCleanTextTable();
				}
			}

			// hashMap for the dictionary, HashMap is faster than TreeMap so the list of words will
			// be kept in a hashMap. When the process finishes the hashMap will be dumped in the database.
			System.out.println("Starting Hashtable for wordList.");
			int initialCapacity = 200000;
			HashMap<String, Integer> wordList = new HashMap<String, Integer>(initialCapacity);

			StringBuilder textId = new StringBuilder();
			int numPagesUsed = 0;

			PrintWriter pw = null;
			if (wikiLog != null)
				pw = new PrintWriter(new FileWriter(new File(wikiLog)));

			try {
				System.out.println("\nStart processing Wikipedia pages.... Start time:" + dateStringIni + "\n");

				for (int i = 0; i < pageId.length; i++) {

					// first filter: a null text means the page is not used
					String pageText = wikiToDB.getTextFromWikiPage(pageId[i], minPageLength, textId, pw);
					if (pageText == null)
						continue;

					Vector<String> textList = removeMarkup(pageText);
					numPagesUsed++;
					for (int j = 0; j < textList.size(); j++) {
						String text = textList.get(j);
						if (text.length() > minTextLength) {
							// the chunk is still long enough after cleaning: keep it
							wikiToDB.insertCleanText(text, pageId[i], textId.toString());
							// insert the words in text in wordlist
							addWordToHashMap(text, wordList);
							if (debug)
								System.out.println("Cleanedpage_id[" + i + "]=" + pageId[i] + "  textList (" + (j + 1) + "/"
										+ textList.size() + ") length=" + text.length() + "  numPagesUsed=" + numPagesUsed
										+ "  Wordlist[" + wordList.size() + "] ");

							if (pw != null)
								pw.println("CLEANED PAGE page_id[" + i + "]=" + pageId[i] + " textList (" + (j + 1) + "/"
										+ textList.size() + ") length=" + text.length() + " Wordlist[" + wordList.size()
										+ "] " + "  NUM_PAGES_USED=" + numPagesUsed + " text:\n\n" + text);
						} else if (pw != null)
							pw.println("PAGE NOT USED AFTER CLEANING page_id[" + i + "]=" + pageId[i] + " length="
									+ text.length());
					} // for each text in textList
					System.out.println("Cleanedpage_id[" + i + "]=" + pageId[i] + "  numPagesUsed=" + numPagesUsed
							+ "  Wordlist[" + wordList.size() + "] ");
					textList.clear(); // clear the list of text
				}

				dateStringEnd = fullDate.format(new Date());

				if (pw != null)
					pw.println("Number of PAGES USED=" + numPagesUsed + " Wordlist[" + wordList.size() + "] "
							+ " minPageLength=" + minPageLength + " minTextLength=" + minTextLength + " Start time:"
							+ dateStringIni + "  End time:" + dateStringEnd);
			} finally {
				// flush and release the log file, also when a page could not be processed
				if (pw != null)
					pw.close();
			}

			// save the wordList in the DB
			updateWordList(wikiToDB, wordList);

			wikiToDB.printWordList("./wordlist-freq.txt", "frequency", 0, 0);

			System.out.println("\nNumber of pages used=" + numPagesUsed + " Wordlist[" + wordList.size() + "] "
					+ " Start time:" + dateStringIni + "  End time:" + dateStringEnd);

			// Once created the cleantext table delete the wikipedia text, page and revision tables.
			wikiToDB.deleteWikipediaTables();
		} finally {
			// never leave the DB connection open, whatever happened above
			wikiToDB.closeDBConnection();
		}

	}

	/**
	 * Prints the current settings to standard output, one option per line, using the command line option names.
	 * <p>
	 * The mysql password itself is never printed: a set password is shown as asterisks, an unset one as {@code null}, so
	 * that the output can still be used to spot a missing parameter without leaking the secret into console logs.
	 */
	private void printParameters() {
		String maskedPasswd = (getMysqlPasswd() == null) ? null : "********";

		System.out.println("WikipediaMarkupCleaner parameters:" + "\n  -mysqlHost " + getMysqlHost() + "\n  -mysqlUser "
				+ getMysqlUser() + "\n  -mysqlPasswd " + maskedPasswd + "\n  -mysqlDB " + getMysqlDB() + "\n  -xmlFile "
				+ getXmlWikiFile() + "\n  -minPage " + getMinPageLength() + "\n  -minText " + getMinTextLength()
				+ "\n  -maxText " + getMaxTextLength() + "\n  -log " + getWikiLog() + "\n  -debugPageId " + getTestId());

		System.out.println("  -debug " + getDebug());
		System.out.println("  -loadWikiTables " + getLoadWikiTables());
		// the last line carries an extra newline to separate the parameters from the following output
		System.out.println("  -deleteCleanTextTable " + getDeleteCleanTextTable() + "\n");
	}

	//
	/**
	 * Reads and parses the command line arguments and stores them through the corresponding setters.
	 * <p>
	 * Parsing fails (a message and the usage text are printed, and {@code false} is returned) when fewer than 12 arguments
	 * are given, an option is unknown, a value-taking option is the last argument, a numeric option has a non-numeric
	 * value, or one of the required settings (locale, mysql host/user/password/DB, XML file) is still unset afterwards.
	 * 
	 * @param args
	 *            the command line arguments
	 * @return true, if successful, false otherwise
	 */
	private boolean readArgs(String[] args) {

		String help = "\nUsage: java WikipediaMarkupCleaner -locale language -mysqlHost host -mysqlUser user  \n"
				+ "                       -mysqlPasswd passwd -mysqlDB wikiDB -xmlFile xmlWikiFile \n"
				+ "      default/optional: [-minPage 10000 -minText 1000 -maxText 15000] \n"
				+ "      optional: [-log wikiLogFile -debugPageId pageId -debug]\n\n"
				+ "      -minPage is the minimum size of a wikipedia page that will be considered for cleaning.\n"
				+ "      -minText is the minimum size of a text to be kept in the DB.\n"
				+ "      -maxText is used to split big articles in small chunks, this is the maximum chunk size. \n"
				+ "      -log the wikiLogFile will contain the cleaned text and information about the pages used.\n"
				+ "      -debug will produce more output and it is mainly used to debug a particular Wikipedia page.\n"
				+ "      -debugPageId is the page_id number in a wikipedia page table (ex. 18702442), when used this option\n"
				+ "           the tables will not be loaded, so it is asumed that page, text and revision tables are already loaded.\n"
				+ "      -noLoadWikiTables use this variable to save time NOT loading wiki tables, they must already exist in the the DB.\n"
				+ "      -noDeleteCleanTextTable use this variable to do NOT create a new cleanText table, but adding to an already existing\n"
				+ "       cleanText table.\n";

		// the six required options with their values already make 12 arguments
		if (args.length < 12) {
			System.out.println(help);
			return false;
		}

		for (int i = 0; i < args.length; i++) {
			String option = args[i];

			if (option.contentEquals("-debug")) {
				setDebug(true);

			} else if (option.contentEquals("-noLoadWikiTables")) {
				// Use this variable to save time NOT loading wiki tables, they must already exist in the DB
				setLoadWikiTables(false);

			} else if (option.contentEquals("-noDeleteCleanTextTable")) {
				// Use this variable to do not create a new cleanText table, but adding to an already existing one
				setDeleteCleanTextTable(false);

			} else if (!expectsValue(option)) { // unknown argument
				System.out.println("\nOption not known: " + option);
				System.out.println(help);
				return false;

			} else if (i + 1 >= args.length) { // nothing left to read the value from
				System.out.println("\nMissing value for option: " + option);
				System.out.println(help);
				return false;

			} else {
				String value = args[++i];
				try {
					if (option.contentEquals("-locale"))
						setLocale(value);
					else if (option.contentEquals("-mysqlHost"))
						setMysqlHost(value);
					else if (option.contentEquals("-mysqlUser"))
						setMysqlUser(value);
					else if (option.contentEquals("-mysqlPasswd"))
						setMysqlPasswd(value);
					else if (option.contentEquals("-mysqlDB"))
						setMysqlDB(value);
					else if (option.contentEquals("-xmlFile"))
						setXmlWikiFile(value);
					// From here the arguments are optional
					else if (option.contentEquals("-minPage"))
						setMinPageLength(Integer.parseInt(value));
					else if (option.contentEquals("-minText"))
						setMinTextLength(Integer.parseInt(value));
					else if (option.contentEquals("-maxText"))
						setMaxTextLength(Integer.parseInt(value));
					else if (option.contentEquals("-log"))
						setWikiLog(value);
					else if (option.contentEquals("-debugPageId"))
						setTestId(value);
				} catch (NumberFormatException e) {
					System.out.println("\nInvalid number for option " + option + ": " + value);
					System.out.println(help);
					return false;
				}
			}
		}

		if (getLocale() == null) {
			System.out.println("\nMissing locale.");
			printParameters();
			System.out.println(help);
			return false;
		}

		if (getMysqlHost() == null || getMysqlUser() == null || getMysqlPasswd() == null || getMysqlDB() == null) {
			System.out.println("\nMissing required mysql parameters (one/several required variables are null).");
			printParameters();
			System.out.println(help);
			return false;
		}

		if (getXmlWikiFile() == null) {
			System.out.println("\nMissing required parameter, the XML wikipedia file\n");
			printParameters();
			System.out.println(help);
			return false;
		}

		return true;
	}

	/**
	 * Tells whether {@code option} is one of the known command line options that must be followed by a value.
	 */
	private static boolean expectsValue(String option) {
		return option.contentEquals("-locale") || option.contentEquals("-mysqlHost") || option.contentEquals("-mysqlUser")
				|| option.contentEquals("-mysqlPasswd") || option.contentEquals("-mysqlDB")
				|| option.contentEquals("-xmlFile") || option.contentEquals("-minPage") || option.contentEquals("-minText")
				|| option.contentEquals("-maxText") || option.contentEquals("-log")
				|| option.contentEquals("-debugPageId");
	}

	/**
	 * Command line entry point: parses the arguments, prints the resulting settings and then either debugs the single page
	 * given with -debugPageId or processes all wikipedia pages.
	 * 
	 * @param args
	 *            the command line arguments
	 * @throws Exception
	 *             if the processing fails
	 */
	public static void main(String[] args) throws Exception {

		WikipediaMarkupCleaner cleaner = new WikipediaMarkupCleaner();

		// nothing is done when the arguments cannot be parsed
		boolean argsAccepted = cleaner.readArgs(args);
		if (argsAccepted) {
			cleaner.printParameters();

			// a page id to debug selects the single-page debug run, otherwise run the full processing
			if (cleaner.getTestId() == null) {
				cleaner.processWikipediaPages();
			} else {
				cleaner.processWikipediaSQLTablesDebug();
			}
		}

	}

}