/** * KOSHIK is an NLP framework for large scale processing using Hadoop. * Copyright © 2014 Peter Exner * * This file is part of KOSHIK. * * KOSHIK is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * KOSHIK is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with KOSHIK. If not, see <http://www.gnu.org/licenses/>. */ package se.lth.cs.koshik.analysis.wikipedia; import org.sweble.wikitext.engine.PageTitle; import org.sweble.wikitext.engine.config.WikiConfig; import org.sweble.wikitext.engine.nodes.EngPage; import org.sweble.wikitext.parser.nodes.WtBold; import org.sweble.wikitext.parser.nodes.WtExternalLink; import org.sweble.wikitext.parser.nodes.WtHorizontalRule; import org.sweble.wikitext.parser.nodes.WtIllegalCodePoint; import org.sweble.wikitext.parser.nodes.WtImageLink; import org.sweble.wikitext.parser.nodes.WtInternalLink; import org.sweble.wikitext.parser.nodes.WtItalics; import org.sweble.wikitext.parser.nodes.WtListItem; import org.sweble.wikitext.parser.nodes.WtNode; import org.sweble.wikitext.parser.nodes.WtNodeList; import org.sweble.wikitext.parser.nodes.WtOrderedList; import org.sweble.wikitext.parser.nodes.WtPageSwitch; import org.sweble.wikitext.parser.nodes.WtParagraph; import org.sweble.wikitext.parser.nodes.WtSection; import org.sweble.wikitext.parser.nodes.WtTagExtension; import org.sweble.wikitext.parser.nodes.WtTemplate; import org.sweble.wikitext.parser.nodes.WtTemplateArgument; import org.sweble.wikitext.parser.nodes.WtTemplateParameter; import org.sweble.wikitext.parser.nodes.WtText; import org.sweble.wikitext.parser.nodes.WtUnorderedList; import org.sweble.wikitext.parser.nodes.WtUrl; import org.sweble.wikitext.parser.nodes.WtWhitespace; import org.sweble.wikitext.parser.nodes.WtXmlCharRef; import org.sweble.wikitext.parser.nodes.WtXmlComment; import org.sweble.wikitext.parser.nodes.WtXmlElement; import org.sweble.wikitext.parser.nodes.WtXmlEntityRef; import org.sweble.wikitext.parser.parser.LinkTargetException; import se.lth.cs.koshik.model.Document; import se.lth.cs.koshik.model.wikipedia.InternalLink; import se.lth.cs.koshik.model.wikipedia.Section; import de.fau.cs.osr.ptk.common.AstVisitor; public class TextConverter extends AstVisitor<WtNode> { private final WikiConfig config; private StringBuilder sb; private String currentSectionTitle; private boolean filterOutput; private boolean expectSectionTitle; private boolean hasNotReadAbstract; /** * Becomes true if we are no long at the Beginning Of the whole Document. */ private Document document; // ========================================================================= public TextConverter(WikiConfig config, Document document) { this.config = config; this.document = document; } @Override protected boolean before(WtNode node) { // This method is called by go() before visitation starts sb = new StringBuilder(); currentSectionTitle = ""; expectSectionTitle = false; hasNotReadAbstract = true; return super.before(node); } @Override protected Object after(WtNode node, Object result) { this.document.setContent(sb.toString()); return sb.toString(); } // ========================================================================= public void visit(WtNode n) { } public void visit(WtNodeList n) { iterate(n); } public void visit(WtUnorderedList e) { iterate(e); } public void visit(WtOrderedList e) { iterate(e); } public void visit(WtListItem item) { iterate(item); } public void visit(EngPage p) { iterate(p); } public void visit(WtText text) { if(expectSectionTitle) { if(!text.getContent().trim().equalsIgnoreCase("")) { currentSectionTitle = text.getContent().trim(); } expectSectionTitle = false; } if(!isInsideFilteredSection() && !text.getContent().replaceAll("[\n\r]", "").equalsIgnoreCase("")) { sb.append(text.getContent()); } } public void visit(WtWhitespace w) { if(!isInsideFilteredSection()) { sb.append(" "); } } public void visit(WtBold b) { iterate(b); } public void visit(WtItalics i) { iterate(i); } public void visit(WtXmlCharRef cr) { if(!isInsideFilteredSection()) { sb.append(Character.toChars(cr.getCodePoint())); } } public void visit(WtXmlEntityRef er) { String ch = er.getResolved(); if (ch == null) { if(!isInsideFilteredSection()) { sb.append('&' + er.getName() + ';'); } } else { if(!isInsideFilteredSection()) { sb.append(ch); } } } public void visit(WtUrl wtUrl) { if (!wtUrl.getProtocol().isEmpty()) { if(!isInsideFilteredSection()) { sb.append(wtUrl.getProtocol() + ':'); } } if(!isInsideFilteredSection()) { sb.append(wtUrl.getPath()); } } public void visit(WtExternalLink link) { } public void visit(WtInternalLink link) { try { if (link.getTarget().isResolved()) { PageTitle page = PageTitle.make(config, link.getTarget().getAsString()); if (page.getNamespace().equals(config.getNamespace("Category"))) return; } } catch (LinkTargetException e) { } InternalLink internalLink = new InternalLink(document); internalLink.setBegin(sb.length()); internalLink.setTarget(link.getTarget().getAsString()); if(!isInsideFilteredSection()) { sb.append(link.getPrefix()); } if (!link.hasTitle()) { iterate(link.getTarget()); } else { iterate(link.getTitle()); } if(!isInsideFilteredSection()) { sb.append(link.getPostfix()); } internalLink.setEnd(sb.length()); } public void visit(WtSection s) { if(hasNotReadAbstract && sb.length() > 0) { Section section = new Section(document); section.setTitle("Abstract"); section.setBegin(0); section.setEnd(sb.length()); hasNotReadAbstract = false; } Section section = new Section(document); section.setBegin(sb.length()); filterOutput = true; expectSectionTitle = true; iterate(s.getHeading()); section.setTitle(currentSectionTitle); expectSectionTitle = false; filterOutput = false; if(!isInsideFilteredSection()) { if(sb.length() > 0) { if(sb.charAt(sb.length() -1) != '\n') { sb.append("\n\n"); } } } iterate(s.getBody()); section.setEnd(sb.length()); } public void visit(WtParagraph p) { iterate(p); if(!isInsideFilteredSection()) { if(sb.length() > 0) { if(sb.charAt(sb.length() -1) != '\n') { sb.append("\n\n"); } } } } public void visit(WtHorizontalRule hr) { } public void visit(WtXmlElement e) { if (e.getName().equalsIgnoreCase("br")) { } else { iterate(e.getBody()); } } // ========================================================================= // Stuff we want to hide public void visit(WtImageLink n) { } public void visit(WtIllegalCodePoint n) { } public void visit(WtXmlComment n) { } public void visit(WtTemplate n) { if(!isInsideFilteredSection()) { if(n.toString().equalsIgnoreCase("WtTemplate([0] = WtName[WtText(\"spaced ndash\")], [1] = WtTemplateArguments[])")) { sb.append(" - "); } } } public void visit(WtTemplateArgument n) { } public void visit(WtTemplateParameter n) { } public void visit(WtTagExtension n) { } public void visit(WtPageSwitch n) { } // ========================================================================= private boolean isInsideFilteredSection() { if(filterOutput) { return true; } if(this.currentSectionTitle.equalsIgnoreCase("See also") || this.currentSectionTitle.equalsIgnoreCase("Notes") || this.currentSectionTitle.equalsIgnoreCase("Writings") || this.currentSectionTitle.equalsIgnoreCase("References") || this.currentSectionTitle.equalsIgnoreCase("Publications") || this.currentSectionTitle.equalsIgnoreCase("Bibliography") || this.currentSectionTitle.equalsIgnoreCase("Further reading") || this.currentSectionTitle.equalsIgnoreCase("External links") || this.currentSectionTitle.equalsIgnoreCase("Se även") || this.currentSectionTitle.equalsIgnoreCase("Källor") || this.currentSectionTitle.equalsIgnoreCase("Externa länkar") || this.currentSectionTitle.equalsIgnoreCase("Referenser") || this.currentSectionTitle.equalsIgnoreCase("Biografi") || this.currentSectionTitle.equalsIgnoreCase("Litteratur") || this.currentSectionTitle.equalsIgnoreCase("参见") || this.currentSectionTitle.equalsIgnoreCase("相关条目") || this.currentSectionTitle.equalsIgnoreCase("注释") || this.currentSectionTitle.equalsIgnoreCase("参考资料") || this.currentSectionTitle.equalsIgnoreCase("外部链接") || this.currentSectionTitle.equalsIgnoreCase("参看")) { return true; } else { return false; } } }