/**
* Copyright 2011 The Open Source Research Group,
* University of Erlangen-Nürnberg
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.languagetool.dev.wikipedia;
import java.util.LinkedList;
import java.util.regex.Pattern;
import org.sweble.wikitext.engine.Page;
import org.sweble.wikitext.engine.PageTitle;
import org.sweble.wikitext.engine.utils.EntityReferences;
import org.sweble.wikitext.engine.utils.SimpleWikiConfiguration;
import org.sweble.wikitext.lazy.LinkTargetException;
import org.sweble.wikitext.lazy.encval.IllegalCodePoint;
import org.sweble.wikitext.lazy.parser.Bold;
import org.sweble.wikitext.lazy.parser.Enumeration;
import org.sweble.wikitext.lazy.parser.EnumerationItem;
import org.sweble.wikitext.lazy.parser.ExternalLink;
import org.sweble.wikitext.lazy.parser.HorizontalRule;
import org.sweble.wikitext.lazy.parser.ImageLink;
import org.sweble.wikitext.lazy.parser.InternalLink;
import org.sweble.wikitext.lazy.parser.Italics;
import org.sweble.wikitext.lazy.parser.Itemization;
import org.sweble.wikitext.lazy.parser.ItemizationItem;
import org.sweble.wikitext.lazy.parser.MagicWord;
import org.sweble.wikitext.lazy.parser.Paragraph;
import org.sweble.wikitext.lazy.parser.Section;
import org.sweble.wikitext.lazy.parser.Url;
import org.sweble.wikitext.lazy.parser.Whitespace;
import org.sweble.wikitext.lazy.parser.XmlElement;
import org.sweble.wikitext.lazy.preprocessor.TagExtension;
import org.sweble.wikitext.lazy.preprocessor.Template;
import org.sweble.wikitext.lazy.preprocessor.TemplateArgument;
import org.sweble.wikitext.lazy.preprocessor.TemplateParameter;
import org.sweble.wikitext.lazy.preprocessor.XmlComment;
import org.sweble.wikitext.lazy.utils.XmlCharRef;
import org.sweble.wikitext.lazy.utils.XmlEntityRef;
import de.fau.cs.osr.ptk.common.Visitor;
import de.fau.cs.osr.ptk.common.ast.AstNode;
import de.fau.cs.osr.ptk.common.ast.NodeList;
import de.fau.cs.osr.ptk.common.ast.Text;
/**
* A visitor to convert an article AST into a pure text representation. To
* better understand the visitor pattern as implemented by the Visitor class,
* please take a look at the following resources:
* <ul>
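* <li><a href="http://en.wikipedia.org/wiki/Visitor_pattern">http://en.wikipedia.org/wiki/Visitor_pattern</a>
* (classic pattern)</li>
* <li><a href="http://www.javaworld.com/javaworld/javatips/jw-javatip98.html">http://www.javaworld.com/javaworld/javatips/jw-javatip98.html</a>
* (the version we use here)</li>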
* </ul>
*
* The methods needed to descend into an AST and visit the children of a given
* node <code>n</code> are
* <ul>
* <li><code>dispatch(n)</code> - visit node <code>n</code>,</li>
* <li><code>iterate(n)</code> - visit the <b>children</b> of node
* <code>n</code>,</li>
* <li><code>map(n)</code> - visit the <b>children</b> of node <code>n</code>
* and gather the return values of the <code>visit()</code> calls in a list,</li>
* <li><code>mapInPlace(n)</code> - visit the <b>children</b> of node
* <code>n</code> and replace each child node <code>c</code> with the return
* value of the call to <code>visit(c)</code>.</li>
* </ul>
*/
public class TextConverter extends Visitor
{
    /** Matches runs of whitespace; used to split text into individual words. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Wiki configuration, used to resolve link targets and namespaces. */
    private final SimpleWikiConfiguration config;

    /** Column at which lines are wrapped while {@link #noWrap} is false. */
    private final int wrapCol;

    /** Finished output; completed lines are appended here. */
    private StringBuilder sb;

    /** The line currently being assembled; flushed into {@link #sb} by {@link #finishLine()}. */
    private StringBuilder line;

    /** Running number for external links; currently unused because external links are not rendered. */
    private int extLinkNum;

    /** Set once the first word has been written; until then requested newlines and spaces are dropped. */
    private boolean pastBod;

    /** Number of line breaks owed before the next word is written. */
    private int needNewlines;

    /** Whether a single space is owed before the next word is written. */
    private boolean needSpace;

    /** While true, {@link #writeWord(String)} does not wrap lines at {@link #wrapCol}. */
    private boolean noWrap;

    /** Section counters, one entry per heading level; the entry at index 0 is never printed. */
    private LinkedList<Integer> sections;

    // =========================================================================

    /**
     * Creates a converter.
     *
     * @param config  wiki configuration used to resolve internal link targets
     * @param wrapCol column at which output lines are wrapped
     */
    public TextConverter(SimpleWikiConfiguration config, int wrapCol)
    {
        this.config = config;
        this.wrapCol = wrapCol;
    }

    /**
     * Resets all conversion state. Called by go() before visitation starts.
     */
    @Override
    protected boolean before(AstNode node)
    {
        sb = new StringBuilder();
        line = new StringBuilder();
        extLinkNum = 1;
        pastBod = false;
        needNewlines = 0;
        needSpace = false;
        noWrap = false;
        sections = new LinkedList<Integer>();
        return super.before(node);
    }

    /**
     * Flushes the pending line and returns the accumulated text. Called by go()
     * after visitation has finished; the return value is passed on to the
     * caller of go().
     *
     * @return the converted text as a {@link String}
     */
    @Override
    protected Object after(AstNode node, Object result)
    {
        finishLine();
        return sb.toString();
    }

    // =========================================================================

    /**
     * Fallback for all nodes that are not explicitly handled below: writes a
     * placeholder of the form {@code <NodeName />}.
     */
    public void visit(AstNode n)
    {
        write("<");
        write(n.getNodeName());
        write(" />");
    }

    /** Visits every child of the list. */
    public void visit(NodeList n)
    {
        iterate(n);
    }

    /** Visits the items of a bullet list. */
    public void visit(Itemization e)
    {
        iterate(e.getContent());
    }

    /** Starts each bullet list item on a new line. */
    public void visit(ItemizationItem i)
    {
        newline(1);
        iterate(i.getContent());
    }

    /** Visits the items of a numbered list. */
    public void visit(Enumeration e)
    {
        iterate(e.getContent());
    }

    /** Starts each numbered list item on a new line. */
    public void visit(EnumerationItem item)
    {
        newline(1);
        iterate(item.getContent());
    }

    /** Visits the page content. */
    public void visit(Page p)
    {
        iterate(p.getContent());
    }

    /** Writes the text, collapsing whitespace runs into single spaces. */
    public void visit(Text text)
    {
        write(text.getContent());
    }

    /** Collapses any whitespace node into a single pending space. */
    public void visit(Whitespace w)
    {
        write(" ");
    }

    /** Emits only the content; the bold markup itself is dropped. */
    public void visit(Bold b)
    {
        iterate(b.getContent());
    }

    /** Emits only the content; the italics markup itself is dropped. */
    public void visit(Italics i)
    {
        iterate(i.getContent());
    }

    /** Writes the character(s) denoted by a numeric character reference. */
    public void visit(XmlCharRef cr)
    {
        write(Character.toChars(cr.getCodePoint()));
    }

    /**
     * Writes the resolved entity. {@code &nbsp;} becomes a literal space and
     * unknown entities are written back verbatim as {@code &name;}.
     */
    public void visit(XmlEntityRef er)
    {
        String name = er.getName();
        if ("nbsp".equals(name))
        {
            // Written as a literal character so that it is not collapsed
            // like ordinary whitespace.
            write(' ');
            return;
        }

        String resolved = EntityReferences.resolve(name);
        if (resolved == null)
        {
            write('&');
            write(name);
            write(';');
        }
        else
        {
            write(resolved);
        }
    }

    /** Writes the URL as {@code protocol:path}. */
    public void visit(Url url)
    {
        write(url.getProtocol());
        write(':');
        write(url.getPath());
    }

    /**
     * External links are omitted from the output entirely: neither the link
     * text nor a numbered reference is written.
     */
    public void visit(ExternalLink link)
    {
        // Intentionally empty.
    }

    /**
     * Writes the visible text of an internal link (its title if present,
     * otherwise its target) surrounded by the link's prefix and postfix.
     * Links into the "Category" namespace produce no output.
     */
    public void visit(InternalLink link)
    {
        try
        {
            PageTitle page = PageTitle.make(config, link.getTarget());
            if (page.getNamespace().equals(config.getNamespace("Category")))
            {
                return;
            }
        }
        catch (LinkTargetException ignored)
        {
            // Best effort: a target that cannot be parsed cannot be classified
            // as a category link, so it is rendered like any other link.
        }

        write(link.getPrefix());
        if (link.getTitle().getContent() == null
                || link.getTitle().getContent().isEmpty())
        {
            write(link.getTarget());
        }
        else
        {
            iterate(link.getTitle());
        }
        write(link.getPostfix());
    }

    /**
     * Writes the section heading, prefixed with its section number, on a
     * paragraph of its own and then visits the section body.
     */
    public void visit(Section s)
    {
        finishLine();

        // Render the heading into a scratch buffer with wrapping disabled so
        // that it can be trimmed and prefixed before it reaches the output.
        StringBuilder saveSb = sb;
        boolean saveNoWrap = noWrap;
        sb = new StringBuilder();
        noWrap = true;
        iterate(s.getTitle());
        finishLine();
        String title = sb.toString().trim();
        sb = saveSb;

        int level = s.getLevel();
        if (level >= 1)
        {
            // Bring the counter stack to exactly one entry per level.
            while (sections.size() > level)
            {
                sections.removeLast();
            }
            while (sections.size() < level)
            {
                sections.add(1);
            }

            // Build the "1.2.3. " prefix; the counter at index 0 is skipped.
            StringBuilder numbered = new StringBuilder();
            for (int i = 1; i < sections.size(); ++i)
            {
                numbered.append(sections.get(i));
                numbered.append('.');
            }
            if (numbered.length() > 0)
            {
                numbered.append(' ');
            }
            numbered.append(title);
            title = numbered.toString();
        }

        newline(2);
        write(title);
        newline(2);
        noWrap = saveNoWrap;

        iterate(s.getBody());

        // Drop counters of nested sections and advance this level's counter.
        while (sections.size() > level)
        {
            sections.removeLast();
        }
        // For a level below 1 the stack is empty here; removeLast() would
        // throw NoSuchElementException, so there is nothing to advance.
        if (!sections.isEmpty())
        {
            sections.add(sections.removeLast() + 1);
        }
    }

    /** Writes the paragraph content followed by a blank line. */
    public void visit(Paragraph p)
    {
        iterate(p.getContent());
        newline(2);
    }

    /** A horizontal rule is rendered as a paragraph break only. */
    public void visit(HorizontalRule hr)
    {
        newline(2);
    }

    /**
     * Renders {@code <br>} as a line break; for every other element only the
     * body is visited and the tag itself is dropped.
     */
    public void visit(XmlElement e)
    {
        if (e.getName().equalsIgnoreCase("br"))
        {
            newline(1);
        }
        else
        {
            iterate(e.getBody());
        }
    }

    // =========================================================================
    // Stuff we want to hide: these node types produce no output at all.

    public void visit(ImageLink n)
    {
    }

    public void visit(IllegalCodePoint n)
    {
    }

    public void visit(XmlComment n)
    {
    }

    public void visit(Template n)
    {
    }

    public void visit(TemplateArgument n)
    {
    }

    public void visit(TemplateParameter n)
    {
    }

    public void visit(TagExtension n)
    {
    }

    public void visit(MagicWord n)
    {
    }

    // =========================================================================

    /**
     * Requests at least {@code num} line breaks before the next word. Ignored
     * until the first word has been written.
     */
    private void newline(int num)
    {
        if (pastBod && num > needNewlines)
        {
            needNewlines = num;
        }
    }

    /**
     * Requests a single space before the next word. Ignored until the first
     * word has been written.
     */
    private void wantSpace()
    {
        if (pastBod)
        {
            needSpace = true;
        }
    }

    /** Moves the line under construction into the output buffer. */
    private void finishLine()
    {
        sb.append(line.toString());
        line.setLength(0);
    }

    /**
     * Flushes the current line, appends {@code num} line breaks and clears
     * any pending newline or space request.
     */
    private void writeNewlines(int num)
    {
        finishLine();
        for (int i = 0; i < num; i++)
        {
            sb.append('\n');
        }
        needNewlines = 0;
        needSpace = false;
    }

    /**
     * Appends one word to the current line, first emitting whatever pending
     * newlines or space are owed and wrapping the line if the word would
     * reach {@link #wrapCol}. Empty words are ignored.
     */
    private void writeWord(String s)
    {
        int length = s.length();
        if (length == 0)
        {
            return;
        }

        if (!noWrap && needNewlines <= 0)
        {
            if (needSpace)
            {
                length += 1;
            }
            // Never wrap an empty line: an over-long word stays on its own line.
            if (line.length() + length >= wrapCol && line.length() > 0)
            {
                writeNewlines(1);
            }
        }

        if (needSpace && needNewlines <= 0)
        {
            line.append(' ');
        }
        if (needNewlines > 0)
        {
            writeNewlines(needNewlines);
        }

        needSpace = false;
        pastBod = true;
        line.append(s);
    }

    /**
     * Tells whether {@code c} separates words at the edge of a text chunk.
     * Covers Unicode space separators (including the non-breaking space) as
     * well as the control whitespace matched by {@link #WHITESPACE}, such as
     * newline and tab.
     */
    private static boolean isSeparator(char c)
    {
        return Character.isSpaceChar(c) || Character.isWhitespace(c);
    }

    /**
     * Writes a chunk of text word by word, collapsing every whitespace run
     * into a single pending space. Leading and trailing whitespace of the
     * chunk is preserved as a pending space so that adjacent chunks do not
     * run together. {@code null} and empty strings are ignored.
     */
    private void write(String s)
    {
        if (s == null || s.isEmpty())
        {
            return;
        }

        if (isSeparator(s.charAt(0)))
        {
            wantSpace();
        }

        String[] words = WHITESPACE.split(s);
        for (int i = 0; i < words.length; i++)
        {
            if (i > 0)
            {
                wantSpace();
            }
            writeWord(words[i]);
        }

        // split() discards trailing empty strings, so a trailing newline or
        // tab must be detected here or the next chunk would be glued on.
        if (isSeparator(s.charAt(s.length() - 1)))
        {
            wantSpace();
        }
    }

    /** Writes the characters as a text chunk, see {@link #write(String)}. */
    private void write(char[] cs)
    {
        write(String.valueOf(cs));
    }

    /** Writes a single character verbatim as a word, without whitespace collapsing. */
    private void write(char ch)
    {
        writeWord(String.valueOf(ch));
    }

    /** Writes the decimal representation of {@code num} as a word. */
    private void write(int num)
    {
        writeWord(String.valueOf(num));
    }
}