package org.bedework.tools;
import org.bedework.util.args.Args;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.io.LineNumberReader;
import java.io.Reader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import javax.xml.namespace.QName;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.events.Attribute;
import javax.xml.stream.events.EndElement;
import javax.xml.stream.events.StartElement;
import javax.xml.stream.events.XMLEvent;
public class DmozStructure {
/** Name of the dmoz input file; assigned by processArgs from the -i argument. */
String inFileName;
/** Name of the output directory; assigned by processArgs from the -o argument
 * and only consulted by process() when create is true. */
String outDirName;
/** Set by the -c argument: process() then creates a directory per retained topic. */
boolean create;
/**
 * One retained dmoz topic together with the per-language text and the
 * narrower-topic references collected while its element was parsed.
 */
static class Topic {
  /** Canonical name, taken from the Title element. */
  String name; // canonical
  /** Topic path as stored in the topics map (after fixTopicName). */
  String path;
  /** Title text keyed by language label. */
  Map<String, String> displayNames;
  /** Description text keyed by language label. */
  Map<String, String> descriptions;
  /** Resource values of the narrow child elements, sorted. */
  Set<String> narrows;
  /** Resource values of the narrow1 child elements, sorted. */
  Set<String> narrow1s;
  /** Resource values of the narrow2 child elements, sorted. */
  Set<String> narrow2s;

  /** Creates a topic with empty text maps and empty reference sets. */
  Topic() {
    displayNames = new HashMap<String, String>();
    descriptions = new HashMap<String, String>();
    narrows = new TreeSet<String>();
    narrow1s = new TreeSet<String>();
    narrow2s = new TreeSet<String>();
  }
}
/**
 * Links an alternate-language reference to the topic that declared it.
 * processAltLang creates one per altlang/altlang1 element.
 */
static class LangRef {
  /** Language label: the text before the ':' in the altlang resource value. */
  String lang;
  /** Topic that was current when the reference was read. */
  Topic topic;

  /**
   * @param lang language label of the referenced topic
   * @param topic topic that owns the reference
   */
  LangRef(String lang, Topic topic) {
    this.topic = topic;
    this.lang = lang;
  }
}
/* Retained topics keyed by their fixed path (see processTopicStart). */
private Map<String, Topic> topics = new HashMap<String, Topic>();
/* Fixed path of an alternate-language topic -> its language and owning topic
 * (filled by processAltLang, consulted by processTopicStart). */
private Map<String, LangRef> altLangs = new HashMap<String, LangRef>();
/* While non-null, processInFile discards every event up to and including the
 * end element with this name. */
private QName skippingElement;
/* Topic currently being filled in; null outside a retained Topic element. */
private Topic curTopic;
/* Collects character data between a Title/Description start and end tag;
 * null when no text is being collected. */
private StringBuilder curText;
/* Language label of the alternate-language topic being read; when null the
 * text is stored under defLang. */
private String curLang;
/* Set when the current Topic element was previously referenced as an altlang;
 * cleared at the Topic end tag. Not read anywhere in this class. */
private boolean doingAltLang;
/* Key under which default-language titles and descriptions are stored. */
private static final String defLang = "DEF";
/* XML namespaces used by the dmoz RDF dump. These and the QNames below are
 * constants, so they are final to prevent accidental reassignment. */
private static final String dmozns = "http://dmoz.org/rdf/";
private static final String w3ns = "http://www.w3.org/TR/RDF/";
private static final String purlns = "http://purl.org/dc/elements/1.0/";
/* Element and attribute names the parser reacts to. */
private static final QName qnTopic = new QName(dmozns, "Topic");
private static final QName qnTopicIdAttr = new QName(w3ns, "id");
private static final QName qnDescription = new QName(purlns, "Description");
private static final QName qnTitle = new QName(purlns, "Title");
private static final QName qnAltlang = new QName(dmozns, "altlang");
private static final QName qnAltlang1 = new QName(dmozns, "altlang1");
private static final QName qnNarrow = new QName(dmozns, "narrow");
private static final QName qnNarrow1 = new QName(dmozns, "narrow1");
private static final QName qnNarrow2 = new QName(dmozns, "narrow2");
private static final QName qnResource = new QName(w3ns, "resource");
/* Keep if id equals one of these; checkSkips tests this list before any skip
 * rule. NOTE(review): an earlier comment said "starts with" - confirm which
 * behaviour is intended. */
private static List<String> keepList = new ArrayList<String>();
/* Skip if id equals one of these or starts with it followed by "/"; an entry
 * that itself ends with "/" is matched as a plain prefix. */
private static List<String> skipList = new ArrayList<String>();
/* Skip if id ends with one of these */
private static List<String> skipEndList = new ArrayList<String>();
/* Skip if id starts with one of these and contains a "/" after that prefix,
 * i.e. lies more than one path element below it. */
private static List<String> skip1elementFollowingList = new ArrayList<String>();
/* Skip if id equals one of these + a single character from skipSuffices */
private static List<String> skip1charList = new ArrayList<String>();
/* Skip if id equals one of these + a single character from skipSuffices, or
 * starts with that + "/" */
private static List<String> skip1charFollowingList = new ArrayList<String>();
/* Elements we skip: processStartElement discards these and all their content.
 * Only the keys matter; the map is used as a set. */
private static Map<QName, String> skipEls = new HashMap<QName, String>();
static {
  final String[] skippedLocalNames = {
    "Alias", "catid", "editor", "lastUpdate", "letterbar",
    "newsgroup", "related", "symbolic", "symbolic1", "symbolic2"
  };
  for (String localName : skippedLocalNames) {
    skipEls.put(new QName(dmozns, localName), "skip");
  }
}
/* Elements with no end processing: everything they need is done at the start
 * tag, so processEndElement ignores them. Only the keys matter. */
private static Map<QName, String> noEndEls = new HashMap<QName, String>();
static {
  final QName[] startOnly = {
    qnNarrow, qnNarrow1, qnNarrow2, qnAltlang, qnAltlang1
  };
  for (QName qn : startOnly) {
    noEndEls.put(qn, "noend");
  }
}
/* Language labels seen in altlang references. processAltLang stores each label
 * with a null value, so only the key set carries information. */
private Map<String, String> langs = new HashMap<String, String>();
/**
 * Parses the input file and, when {@code create} is set, creates one
 * directory per retained topic under the output directory.
 *
 * <p>Missing file names are reported as errors instead of failing with a
 * NullPointerException. Topic directories are created with
 * {@link File#mkdirs()} because the parent of a retained topic may itself
 * have been filtered out; a directory that cannot be created is reported.
 *
 * @return false when the input file or the output directory is missing or
 *         unusable, otherwise true
 * @throws Throwable on any parse or I/O failure
 */
boolean process() throws Throwable {
  if (inFileName == null) {
    error("No input file specified");
    return false;
  }
  if (create) {
    if (outDirName == null) {
      error("No output directory specified");
      return false;
    }
    if (!new File(outDirName).isDirectory()) {
      error("Not a directory: " + outDirName);
      return false;
    }
  }
  File in = new File(inFileName);
  if (!in.isFile()) {
    error("Not a file: " + inFileName);
    return false;
  }
  if (!processInFile(in)) {
    // Report but carry on with whatever was collected, as before.
    error("Failed?");
  }
  info("topics: " + topics.keySet().size());
  info("altLangs: " + altLangs.keySet().size());
  if (!create) {
    return true;
  }
  // Sorted so that parents are normally created before their children.
  Set<String> sortedTopics = new TreeSet<String>(topics.keySet());
  if (!outDirName.endsWith("/")) {
    outDirName += "/";
  }
  info("Create directory structure under " + outDirName);
  for (String s: sortedTopics) {
    File dir = new File(outDirName + s);
    if (!dir.isDirectory() && !dir.mkdirs()) {
      error("Unable to create directory: " + dir.getPath());
    }
  }
  return true;
}
/**
 * Streams the dmoz RDF file and dispatches each event to the start/end
 * element handlers, collecting character data while {@code curText} is set.
 *
 * <p>The parser is given the raw byte stream so that it honours the encoding
 * named in the XML declaration rather than the platform default charset.
 * Both the parser and the stream are closed when parsing ends, normally or
 * not.
 *
 * @param f the input file
 * @return always true; failures surface as exceptions
 * @throws Throwable on any parse or I/O failure
 */
boolean processInFile(File f) throws Throwable {
  XMLInputFactory factory = XMLInputFactory.newInstance();
  // The dump is self-contained; never resolve external entities.
  factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES,
                      Boolean.FALSE);
  InputStream in = new FileInputStream(f);
  try {
    XMLEventReader reader = factory.createXMLEventReader(in);
    try {
      while (reader.hasNext()) {
        XMLEvent event = reader.nextEvent();
        if (skippingElement != null) {
          // Discard everything up to the matching end tag.
          if (event.isEndElement()
              && event.asEndElement().getName().equals(skippingElement)) {
            skippingElement = null;
          }
          continue;
        }
        if (event.isStartElement()) {
          processStartElement(reader, event.asStartElement());
          continue;
        }
        if (event.isEndElement()) {
          processEndElement(reader, event.asEndElement());
          continue;
        }
        if (event.isCharacters()) {
          String s = event.asCharacters().getData();
          if (curText != null) {
            curText.append(s);
            continue;
          }
          // Text outside Title/Description: only report it if non-blank.
          if (s.trim().length() != 0) {
            info("Text: " + s);
          }
        }
      }
    } finally {
      reader.close();
    }
  } finally {
    in.close();
  }
  return true;
}
/**
 * Handles a start tag: begins skipping, opens a topic, records an altlang or
 * narrow reference, or starts collecting Title/Description text. Any other
 * element is reported together with its attributes.
 *
 * @param reader the event reader (not used here)
 * @param element the start element just read
 */
void processStartElement(XMLEventReader reader,
                         StartElement element) {
  QName elname = element.getName();
  if (skipEls.containsKey(elname)) {
    skippingElement = elname;
    return;
  }
  if (elname.equals(qnTopic)) {
    processTopicStart(element);
    return;
  }
  if (elname.equals(qnAltlang) || elname.equals(qnAltlang1)) {
    processAltLang(element);
    return;
  }
  if (elname.equals(qnDescription) || elname.equals(qnTitle)) {
    // Text is gathered by processInFile until the matching end tag.
    curText = new StringBuilder();
    return;
  }
  if (elname.equals(qnNarrow)) {
    if (curTopic != null) {
      curTopic.narrows.add(getResource(element));
    }
    return;
  }
  if (elname.equals(qnNarrow1)) {
    if (curTopic != null) {
      curTopic.narrow1s.add(getResource(element));
    }
    return;
  }
  if (elname.equals(qnNarrow2)) {
    if (curTopic != null) {
      curTopic.narrow2s.add(getResource(element));
    }
    return;
  }
  // Unknown element: report it through info() like every other message.
  info("Start Element: " + elname);
  Iterator<?> attributes = element.getAttributes();
  while (attributes.hasNext()) {
    Attribute attribute = (Attribute) attributes.next();
    info("Attribute name/value: " + attribute.getName() + "/"
         + attribute.getValue());
  }
}
/**
 * Handles an end tag: closes the current topic or stores the text collected
 * for a Title/Description under the current language.
 *
 * <p>Text is only stored when both collected text and a current topic
 * exist, so a stray Title can no longer cause a NullPointerException. The
 * language is cleared at the end of every Topic so that the next topic does
 * not inherit it. The canonical {@code name} is only taken from a
 * default-language Title.
 *
 * @param reader the event reader (not used here)
 * @param element the end element just read
 */
void processEndElement(XMLEventReader reader,
                       EndElement element) {
  QName elname = element.getName();
  if (skipEls.containsKey(elname) || noEndEls.containsKey(elname)) {
    return;
  }
  if (elname.equals(qnTopic)) {
    curTopic = null;
    doingAltLang = false;
    curLang = null;
    return;
  }
  if (elname.equals(qnDescription)) {
    if ((curText != null) && (curTopic != null)) {
      curTopic.descriptions.put(currentLangKey(), curText.toString());
    }
    curText = null;
    return;
  }
  if (elname.equals(qnTitle)) {
    if ((curText != null) && (curTopic != null)) {
      String title = curText.toString();
      if (curLang == null) {
        // Alternate-language titles must not replace the canonical name.
        curTopic.name = title;
      }
      curTopic.displayNames.put(currentLangKey(), title);
    }
    curText = null;
    return;
  }
  info("End element:" + element.getName());
}

/** Returns the key for storing text: the current language, or defLang if none. */
private String currentLangKey() {
  return (curLang == null) ? defLang : curLang;
}
/**
 * Handles the start of a Topic element. The topic is either skipped (by
 * setting {@code skippingElement}), attached to the topic that referenced it
 * as an alternate language, or registered as a new topic.
 *
 * <p>A newly registered topic always starts in the default language; without
 * that reset it would inherit the language of the last alternate-language
 * topic processed.
 *
 * @param element the Topic start element
 */
void processTopicStart(StartElement element) {
  String id = getAttr(element, qnTopicIdAttr);
  if (topics.get(id) != null) {
    error("Duplicate topic " + id);
  }
  id = fixTopicName(id);
  if (id == null) {
    skippingElement = qnTopic;
    return;
  }
  if (topics.get(id) != null) {
    // Probably fine
    skippingElement = qnTopic;
    return;
  }
  LangRef lr = altLangs.get(id);
  if (lr != null) {
    /* This is a topic referenced as an altlang. Just switch to that as the
     * topic but set the curLang
     */
    curTopic = lr.topic;
    curLang = lr.lang;
    doingAltLang = true;
    return;
  }
  if (!checkSkips(id)) {
    skippingElement = qnTopic;
    return;
  }
  curTopic = new Topic();
  curTopic.path = id;
  // Drop any language left over from an earlier alternate-language topic.
  curLang = null;
  doingAltLang = false;
  topics.put(id, curTopic);
  info(id);
}
/**
 * Applies the keep/skip rules to a topic id. The rule lists are consulted in
 * this order and the first match decides: keepList, skipEndList,
 * skip1charList, skip1elementFollowingList, skip1charFollowingList, skipList.
 *
 * @param id the fixed topic path
 * @return true if the topic is to be kept, false if it is to be skipped
 */
boolean checkSkips(String id) {
  for (String kept : keepList) {
    if (id.equals(kept)) {
      return true;
    }
  }
  for (String ending : skipEndList) {
    if (id.endsWith(ending)) {
      return false;
    }
  }
  for (String prefix : skip1charList) {
    // id is exactly prefix + one suffix character
    if (id.startsWith(prefix)
        && (id.length() == prefix.length() + 1)
        && isSkipSuffixAt(id, prefix.length())) {
      return false;
    }
  }
  for (String prefix : skip1elementFollowingList) {
    if (id.startsWith(prefix) && (id.indexOf("/", prefix.length()) > 0)) {
      return false;
    }
  }
  for (String prefix : skip1charFollowingList) {
    int charPos = prefix.length();
    // id is prefix + one suffix character, optionally followed by "/..."
    if (id.startsWith(prefix)
        && isSkipSuffixAt(id, charPos)
        && ((id.length() == charPos + 1) || (id.charAt(charPos + 1) == '/'))) {
      return false;
    }
  }
  for (String entry : skipList) {
    boolean skipped;
    if (entry.endsWith("/")) {
      skipped = id.startsWith(entry);
    } else {
      skipped = id.equals(entry) || id.startsWith(entry + "/");
    }
    if (skipped) {
      return false;
    }
  }
  return true;
}

/** True if id has a character at pos and it is one of skipSuffices. */
private boolean isSkipSuffixAt(String id, int pos) {
  return (id.length() > pos) && (skipSuffices.indexOf(id.charAt(pos)) >= 0);
}

/** Characters treated as a single-character path element by the skip rules. */
String skipSuffices = "ABCDEFGHIJKLMNOPQRSTUVWXYZ01234567890";
/**
 * Records an altlang/altlang1 reference of the current topic. The resource
 * value has the form {@code <lang>:<topic id>}.
 *
 * <p>The language label is always recorded in {@code langs}. The reference
 * itself is dropped when fixTopicName rejects the id; previously such a
 * reference was stored in {@code altLangs} under a null key.
 *
 * @param element the altlang or altlang1 start element
 */
void processAltLang(StartElement element) {
  if (curTopic == null) {
    return;
  }
  String resource = getResource(element);
  int pos = resource.indexOf(':');
  if (pos < 0) {
    error("No ':' in " + resource);
    return;
  }
  String lang = resource.substring(0, pos);
  if (!langs.containsKey(lang)) {
    langs.put(lang, null);
  }
  String id = fixTopicName(resource.substring(pos + 1));
  if (id == null) {
    // The referenced topic would itself be skipped by processTopicStart.
    return;
  }
  altLangs.put(id, new LangRef(lang, curTopic));
}
/** Matches a path element that is a single upper-case letter, with both
 * slashes. Compiled once because fixTopicName runs for every topic. */
private static final Pattern singleUpperElement =
    Pattern.compile("/\\p{Upper}/");

/**
 * Normalises a topic id.
 *
 * @param val the raw topic id
 * @return {@code val} unchanged when it contains no "/"; null when its last
 *         path element is a single character; otherwise {@code val} with
 *         the first single-upper-case-letter path element removed
 */
String fixTopicName(String val) {
  int pos = val.lastIndexOf('/');
  if (pos < 0) {
    return val;
  }
  if (val.length() - pos == 2) {
    // Don't want 1 char element
    return null;
  }
  return singleUpperElement.matcher(val).replaceFirst("/");
}
/**
 * Returns the value of a required attribute of a start element.
 *
 * @param element the start element to inspect
 * @param qn qualified name of the attribute
 * @return the attribute value
 * @throws IllegalArgumentException if the element has no such attribute
 *         (previously this surfaced as a bare NullPointerException)
 */
String getAttr(StartElement element, QName qn) {
  Attribute attr = element.getAttributeByName(qn);
  if (attr == null) {
    throw new IllegalArgumentException(
        "Missing attribute " + qn + " on element " + element.getName());
  }
  return attr.getValue();
}
/**
 * Returns the value of the element's resource attribute (qnResource), as
 * read by getAttr.
 *
 * @param element the start element to inspect
 * @return the attribute value
 */
String getResource(StartElement element) {
return getAttr(element, qnResource);
}
/**
 * Loads keep/skip rules from a text file, one rule per line, into the static
 * rule lists used by checkSkips. Lines are trimmed; blank lines and lines
 * starting with "#" are ignored. The reader is closed when loading ends,
 * normally or not.
 *
 * @param f the rules file
 * @throws Throwable on any I/O failure
 */
void processSkips(File f) throws Throwable {
  LineNumberReader lnr = new LineNumberReader(new FileReader(f));
  try {
    for (;;) {
      String ln = lnr.readLine();
      if (ln == null) {
        break;
      }
      ln = ln.trim();
      if ((ln.length() == 0) || ln.startsWith("#")) {
        continue;
      }
      // "+path": always keep this exact path
      if (ln.startsWith("+")) {
        keepList.add(ln.substring(1));
        continue;
      }
      // "//*//name": skip any id ending with "/name"
      if (ln.startsWith("//*//")) {
        skipEndList.add(ln.substring(4)); // Leave one slash
        continue;
      }
      // "prefix?/": skip ids lying more than one element below prefix
      if (ln.endsWith("?/")) {
        skip1elementFollowingList.add(ln.substring(0, ln.length() - 2));
        continue;
      }
      // "prefix*1*": skip prefix + a single character
      if (ln.endsWith("*1*")) {
        skip1charList.add(ln.substring(0, ln.length() - 3));
        continue;
      }
      // "prefix*1*/": skip prefix + a single character and everything below
      if (ln.endsWith("*1*/")) {
        skip1charFollowingList.add(ln.substring(0, ln.length() - 4));
        continue;
      }
      // Anything else: plain skip entry
      skipList.add(ln);
    }
  } finally {
    lnr.close();
  }
}
/**
 * Reads the command-line options into this object: -i (input file),
 * -o (output directory), -c (create directories) and -sf (skip-rules file,
 * loaded immediately through processSkips).
 *
 * @param args the argument cursor; null means "no arguments"
 * @return false after reporting an unknown option or a skip file that is
 *         not a file, otherwise true
 * @throws Throwable whatever the argument cursor or processSkips throws
 */
boolean processArgs(final Args args) throws Throwable {
  if (args == null) {
    return true;
  }
  while (args.more()) {
    if (args.ifMatch("")) {
      continue;
    }
    if (args.ifMatch("-i")) {
      inFileName = args.next();
      continue;
    }
    if (args.ifMatch("-o")) {
      outDirName = args.next();
      continue;
    }
    if (args.ifMatch("-c")) {
      create = true;
      continue;
    }
    if (!args.ifMatch("-sf")) {
      error("Illegal argument: " + args.current());
      usage();
      return false;
    }
    final String skipFileName = args.next();
    final File skipFile = new File(skipFileName);
    if (!skipFile.isFile()) {
      error("Not a file: " + skipFileName);
      return false;
    }
    processSkips(skipFile);
  }
  return true;
}
/**
 * Writes an informational message to standard output.
 *
 * @param msg the text to print
 */
protected void info(final String msg) {
System.out.println(msg);
}
/**
 * Writes an error message to standard error.
 *
 * @param msg the text to print
 */
protected void error(final String msg) {
System.err.println(msg);
}
/** Prints the command-line help, one info() call per line. */
void usage() {
  final String[] helpLines = {
    "Usage:",
    "args -i <file>",
    " specify dmoz file containing input",
    " -o <dirname>",
    " specify directory containing result",
    " -sf <file>",
    " specify file containing skip paths",
    " -c",
    " create directory structure",
    ""
  };
  for (String line : helpLines) {
    info(line);
  }
}
/**
 * Command-line entry point: parses the options and, if they are valid, runs
 * process(). Any Throwable is caught and its stack trace printed.
 *
 * @param args command-line arguments, as described by usage()
 */
public static void main(String[] args) {
  try {
    final DmozStructure ds = new DmozStructure();
    if (ds.processArgs(new Args(args))) {
      ds.process();
    }
  } catch (Throwable t) {
    t.printStackTrace();
  }
}
}