package org.wikipedia.miner.extract.util;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.simpleframework.xml.Attribute;
import org.simpleframework.xml.Element;
import org.simpleframework.xml.ElementList;
import org.simpleframework.xml.Root;
import org.simpleframework.xml.Serializer;
import org.simpleframework.xml.Text;
import org.simpleframework.xml.Transient;
import org.simpleframework.xml.core.Persister;

/**
 * In-memory form of a {@code <siteinfo>} XML element: site name, base,
 * generator, case rule and the list of namespaces.
 *
 * <p>Instances are produced by the static {@code load...} methods, which
 * deserialize the XML with a Simple XML {@link Persister}. The by-name and
 * by-key namespace indexes are built lazily on first lookup and cached; no
 * synchronization is performed around that lazy initialization.
 */
@Root
public class SiteInfo {

	/** Key of the main namespace (see {@link #getMainNamespace()}). */
	public static final int MAIN_KEY = 0;

	/** Key constant for the "special" namespace. */
	public static final int SPECIAL_KEY = -1;

	/** Key constant for the "file" namespace. */
	public static final int FILE_KEY = 6;

	/** Key constant for the "template" namespace. */
	public static final int TEMPLATE_KEY = 10;

	/** Key constant for the "category" namespace. */
	public static final int CATEGORY_KEY = 14;

	@Element(name="sitename")
	private String siteName;

	@Element
	private String base;

	@Element
	private String generator;

	@Element(name="case")
	private String caseRule;

	@ElementList(name="namespaces",entry="namespace")
	private List<Namespace> namespaces;

	/** Lazily built index of namespaces keyed by normalized (lower-cased, trimmed) name. */
	@Transient
	private Map<String, Namespace> namespacesByName;

	/** Lazily built index of namespaces keyed by their integer key. */
	@Transient
	private Map<Integer, Namespace> namespacesByKey;

	/**
	 * Deserializes a SiteInfo from a file that contains only the site info XML.
	 *
	 * @param file the XML file to read
	 * @return the deserialized site info
	 * @throws Exception if the serializer cannot read or parse the file
	 */
	public static SiteInfo load(File file) throws Exception {
		Serializer serializer = new Persister();
		return serializer.read(SiteInfo.class, file);
	}

	/**
	 * Deserializes a SiteInfo from a stream that contains only the site info XML.
	 *
	 * @param input the stream to read; it is not closed by this method
	 * @return the deserialized site info
	 * @throws Exception if the serializer cannot read or parse the stream
	 */
	public static SiteInfo load(InputStream input) throws Exception {
		Serializer serializer = new Persister();
		return serializer.read(SiteInfo.class, input);
	}

	/**
	 * Extracts the {@code <siteinfo>} element from the head of a larger dump file
	 * and deserializes it.
	 *
	 * <p>The file is scanned line by line. Every line from the first one that
	 * contains {@code <siteinfo>} up to and including the first one that contains
	 * {@code </siteinfo>} is collected and handed to the XML serializer. Scanning
	 * gives up after roughly 100 lines without a start tag, or roughly 100 lines
	 * inside the element without an end tag, so that a huge dump is never read in
	 * full.
	 *
	 * <p>The file is decoded as UTF-8 (assumes the dump is UTF-8 encoded, as
	 * MediaWiki XML dumps are) rather than with the platform default charset.
	 *
	 * @param file the dump file to scan
	 * @return the deserialized site info
	 * @throws Exception if the start or end of the element cannot be found within
	 *         the line limits or before the end of the file, if the file cannot be
	 *         read, or if the extracted XML cannot be parsed
	 */
	public static SiteInfo loadFromDump(File file) throws Exception {

		final int maxBeforeLines = 100;
		final int maxDuringLines = 100;

		StringBuilder sb = new StringBuilder();

		boolean started = false;
		boolean finished = false;
		int beforeLineCount = 0;
		int duringLineCount = 0;

		BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(file), "UTF-8"));
		try {
			String line;
			while ((line = reader.readLine()) != null) {

				if (line.contains("<siteinfo>"))
					started = true;

				if (started) {
					duringLineCount++;
					sb.append(line).append('\n');
				} else {
					beforeLineCount++;
				}

				if (line.contains("</siteinfo>")) {
					// An end tag only completes the element if its start was seen.
					finished = started;
					break;
				}

				if (beforeLineCount > maxBeforeLines)
					break;

				if (duringLineCount > maxDuringLines)
					break;
			}
		} finally {
			// Always release the file handle, even when reading fails part-way.
			reader.close();
		}

		// Also covers the file ending (or an end tag appearing) before any start tag.
		if (!started || beforeLineCount > maxBeforeLines)
			throw new Exception("Could not detect start of site info element");

		// Also covers the file ending before the end tag was reached.
		if (!finished || duringLineCount > maxDuringLines)
			throw new Exception("Could not detect end of site info element");

		Serializer serializer = new Persister();
		return serializer.read(SiteInfo.class, sb.toString());
	}

	/** @return the value of the {@code sitename} element */
	public String getSiteName() {
		return siteName;
	}

	/** @return the value of the {@code base} element */
	public String getBase() {
		return base;
	}

	/** @return the value of the site-level {@code case} element */
	public String getCaseRule() {
		return caseRule;
	}

	/** @return the value of the {@code generator} element */
	public String getGenerator() {
		return generator;
	}

	/**
	 * @return the namespaces in the order they were deserialized; this is the
	 *         internal list, not a copy
	 */
	public List<Namespace> getNamespaces() {
		return namespaces;
	}

	/**
	 * Looks up a namespace by name, ignoring case and surrounding whitespace.
	 *
	 * @param name the namespace name; may be null
	 * @return the matching namespace, or null if {@code name} is null or no
	 *         namespace has that name
	 */
	public Namespace getNamespace(String name) {
		if (name == null)
			return null;

		return getNamespacesByName().get(normalizeName(name));
	}

	/**
	 * Looks up a namespace by its integer key.
	 *
	 * @param key the namespace key, e.g. {@link #CATEGORY_KEY}
	 * @return the matching namespace, or null if there is none with that key
	 */
	public Namespace getNamespace(int key) {
		return getNamespacesByKey().get(key);
	}

	/**
	 * @return the namespace whose key is {@link #MAIN_KEY}, or null if there is none
	 */
	public Namespace getMainNamespace() {
		return getNamespacesByKey().get(MAIN_KEY);
	}

	/**
	 * Produces the canonical form used both for index keys and for lookups, so
	 * that the two always agree.
	 */
	private static String normalizeName(String name) {
		return name.toLowerCase().trim();
	}

	/** Returns the by-name index, building and caching it on first use. */
	private Map<String,Namespace> getNamespacesByName() {

		if (namespacesByName != null)
			return namespacesByName;

		namespacesByName = new HashMap<String,Namespace>();

		for (Namespace namespace : namespaces)
			namespacesByName.put(normalizeName(namespace.getName()), namespace);

		return namespacesByName;
	}

	/** Returns the by-key index, building and caching it on first use. */
	private Map<Integer,Namespace> getNamespacesByKey() {

		if (namespacesByKey != null)
			return namespacesByKey;

		namespacesByKey = new HashMap<Integer,Namespace>();

		for (Namespace namespace : namespaces)
			namespacesByKey.put(namespace.getKey(), namespace);

		return namespacesByKey;
	}

	/**
	 * A single {@code <namespace>} entry: an integer key, a case rule, and an
	 * optional text name.
	 */
	public static class Namespace {

		@Attribute
		private int key;

		@Attribute(name="case")
		private String caseRule;

		// Optional: an entry may carry no text, in which case this stays null.
		@Text(required=false)
		private String name;

		/** @return the value of the {@code key} attribute */
		public int getKey() {
			return key;
		}

		/** @return the value of this namespace's {@code case} attribute */
		public String getCaseRule() {
			return caseRule;
		}

		/**
		 * @return the namespace name, or the empty string when the entry had no
		 *         text; never null
		 */
		public String getName() {
			if (name == null)
				return "";
			else
				return name;
		}
	}
}