package org.wikibrain.core.model;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.wikibrain.core.WikiBrainException;
import org.wikibrain.core.lang.Language;
import org.wikibrain.core.lang.LanguageInfo;
import org.wikibrain.utils.WpStringUtils;
import java.io.*;
import java.net.URLEncoder;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Represents a canonically capitalized Wikipedia title in a particular language edition.
 * Contains utility methods for querying aspects of the title (namespace, disambiguator,
 * URL, hash codes).
 *
 * <p>Serialization note: only the canonical title string is written by
 * {@link #writeExternal(ObjectOutput)}; the language is NOT part of the serialized form,
 * so an instance restored through {@link #readExternal(ObjectInput)} has no language.</p>
 */
public class Title implements Externalizable {
    private static final long serialVersionUID = 3L;
    private static final Logger LOG = LoggerFactory.getLogger(Title.class);

    /** Splits a title such as "Mash (film)" into the name ("Mash") and the disambiguator ("film"). */
    private static final Pattern NAME_AND_DISAMBIGUATOR_PATTERN = Pattern.compile("(.+?)\\s*\\((.+?)\\)");

    /** A colon that has whitespace on both sides. */
    private static final Pattern COLON_PATTERN = Pattern.compile("\\s+:\\s+");

    private String canonicalTitle;
    private LanguageInfo language;

    /**
     * No-argument constructor required by the {@link Externalizable} contract; without it
     * deserialization fails with an {@code InvalidClassException}.
     * The instance is populated by {@link #readExternal(ObjectInput)}, which restores the
     * canonical title only (the language stays {@code null}).
     */
    public Title() {
    }

    /**
     * Creates a title from raw text, canonicalizing it for the given language.
     *
     * @param text     raw title text
     * @param language language edition the title belongs to
     */
    public Title(String text, LanguageInfo language) {
        this(text, false, language);
    }

    /**
     * Creates a title.
     *
     * @param text        title text
     * @param isCanonical if {@code true}, {@code text} is stored as-is; otherwise it is
     *                    passed through {@link #canonicalize(String, LanguageInfo)}
     * @param lang        language edition the title belongs to
     */
    public Title(String text, boolean isCanonical, LanguageInfo lang) {
        this.canonicalTitle = isCanonical ? text : canonicalize(text, lang);
        this.language = lang;
    }

    /**
     * Creates a title from raw text, looking up the {@link LanguageInfo} for the language.
     *
     * @param title    raw title text
     * @param language language edition the title belongs to
     */
    public Title(String title, Language language) {
        this(title, LanguageInfo.getByLanguage(language));
    }

    /**
     * @return the canonical form of the title
     */
    public String getCanonicalTitle() {
        return canonicalTitle;
    }

    /**
     * @return the language info of this title (may be {@code null} for a deserialized instance)
     */
    public LanguageInfo getLanguageInfo() {
        return language;
    }

    /**
     * @return the language of this title
     */
    public Language getLanguage() {
        return language.getLanguage();
    }

    /**
     * Determines the namespace of the title. The language's category pattern is checked
     * first; otherwise the namespace is looked up from the prefix before the first colon.
     *
     * @return {@code NameSpace.CATEGORY} if the category pattern matches,
     *         {@code NameSpace.ARTICLE} if there is no recognized namespace prefix,
     *         otherwise the namespace registered under the prefix
     */
    public NameSpace getNamespace() {
        Matcher categoryMatcher = language.getCategoryPattern().matcher(canonicalTitle);
        if (categoryMatcher.find()) {
            return NameSpace.CATEGORY;
        }
        String prefix = getNamespaceString();
        if (prefix == null) {
            return NameSpace.ARTICLE;
        }
        return NameSpace.getNameSpaceByName(prefix);
    }

    /**
     * Gets the namespace prefix of the title (e.g. the "Category" in "Category:Foo").
     *
     * @return the text before the first colon if it is a recognized namespace string,
     *         otherwise {@code null}
     */
    public String getNamespaceString() {
        return getNamespaceString(this.canonicalTitle);
    }

    /**
     * Extracts the namespace prefix from a title string.
     *
     * @param text title text
     * @return the part before the first colon if {@code NameSpace.isNamespaceString}
     *         accepts it, otherwise {@code null}
     */
    private static String getNamespaceString(String text) {
        if (text.equals(":")) {
            return null;
        }
        if (!text.contains(":")) {
            return null;
        }
        // split() drops trailing empty strings, so a text made only of colons yields no parts
        String[] parts = text.split(":");
        if (parts.length > 0 && NameSpace.isNamespaceString(parts[0])) {
            return parts[0];
        }
        return null;
    }

    /**
     * Gets the title with its namespace prefix removed.
     *
     * @return the trimmed part of the title after the first colon when the part before it
     *         is a recognized namespace string; otherwise the whole title
     */
    public String getTitleStringWithoutNamespace() {
        return getTitleStringWithoutNamespace(canonicalTitle);
    }

    /**
     * Strips a recognized namespace prefix from a title string.
     *
     * @param text title text
     * @return the trimmed remainder after the first colon if the prefix is a namespace
     *         string, otherwise {@code text} unchanged
     */
    private static String getTitleStringWithoutNamespace(String text) {
        String[] parts = text.split(":", 2);
        boolean hasNamespace = parts.length > 1 && NameSpace.isNamespaceString(parts[0]);
        return hasNamespace ? parts[1].trim() : text;
    }

    /**
     * @return the canonical title followed by the language code in parentheses, e.g.
     *         "Foo (en)"; just the canonical title if no language is set
     */
    @Override
    public String toString() {
        if (language == null) {
            // e.g. an instance restored by readExternal(), which does not carry a language
            return canonicalTitle;
        }
        return canonicalTitle + " (" + getLanguage().getLangCode() + ")";
    }

    /**
     * Hashes on the canonical title only; titles that are equal always share a
     * canonical title, so this is consistent with {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        return canonicalTitle.hashCode();
    }

    /**
     * Two titles are equal when they have the same canonical title and the same language code.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof Title)) {
            return false;
        }
        Title other = (Title) o;
        if (!canonicalTitle.equals(other.canonicalTitle)) {
            return false;
        }
        String mine = langCodeOrNull();
        String theirs = other.langCodeOrNull();
        return mine == null ? theirs == null : mine.equals(theirs);
    }

    /** Returns the language code, or {@code null} when no language is set. */
    private String langCodeOrNull() {
        return language == null ? null : language.getLanguage().getLangCode();
    }

    /**
     * Creates an upper-cased copy of this title.
     *
     * @return a new title in the same language whose canonical title is this title's
     *         canonical title converted to upper case (it is not re-canonicalized)
     */
    public Title toUpperCase() {
        // Use the canonical title, not toString(): toString() appends " (<langCode>)",
        // which would otherwise become part of the new title.
        String upTitle = canonicalTitle.toUpperCase();
        return new Title(upTitle, true, language);
    }

    /**
     * Builds the URL of the page with this title.
     *
     * @return "http://" + the language's domain + "/wiki/" + the URL-encoded (UTF-8)
     *         canonical title with spaces replaced by underscores
     * @throws WikiBrainException if UTF-8 encoding is not supported
     */
    public String toUrl() throws WikiBrainException {
        try {
            String underscored = getCanonicalTitle().replace(' ', '_');
            return "http://" + language.getLanguage().getDomain() + "/wiki/" + URLEncoder.encode(underscored, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            throw new WikiBrainException(e);
        }
    }

    /**
     * @return a 64-bit hash of this title's language, namespace and canonical title
     */
    public long longHashCode() {
        return longHashCode(language.getLanguage(), getCanonicalTitle(), getNamespace());
    }

    /**
     * @param l     language of the title
     * @param title title text
     * @param ns    namespace of the title
     * @return a 64-bit hash computed from the language id, the namespace's arbitrary id and the title
     */
    public static long longHashCode(Language l, String title, NameSpace ns) {
        return longHashCode(l.getId(), title, ns.getArbitraryId());
    }

    /**
     * @param langId        language id
     * @param title         title text
     * @param nsArbitraryId arbitrary id of the namespace
     * @return {@code WpStringUtils.longHashCode} of the string "langId.nsArbitraryId.title"
     */
    public static long longHashCode(int langId, String title, int nsArbitraryId) {
        return WpStringUtils.longHashCode(langId + "." + nsArbitraryId + "." + title);
    }

    /**
     * Deconstructs a title such as "Mash (film)" into {"Mash", "film"}.
     *
     * @return a two-element array {name, disambiguator}; when the title has no
     *         parenthesized part the name is the whole canonical title and the
     *         disambiguator is {@code null}
     */
    public String[] getNameAndDisambiguator() {
        // Match against the canonical title, not toString(): the " (<langCode>)" suffix of
        // toString() would otherwise be reported as the disambiguator of plain titles.
        String name = canonicalTitle;
        String disam = null;
        Matcher m = NAME_AND_DISAMBIGUATOR_PATTERN.matcher(canonicalTitle);
        if (m.find()) {
            name = m.group(1);
            disam = m.group(2);
        }
        return new String[] {name, disam};
    }

    /**
     * Converts a title into its canonical Wikipedia representation.
     * This may be imperfect, but it's a good guess.
     *
     * @param title raw title text
     * @param lang  language whose category namespace names are used for normalization
     * @return the canonicalized title
     */
    public static String canonicalize(String title, LanguageInfo lang) {
        // spaces and underscores are equivalent
        title = title.replace('_', ' ');

        // remove the fragment
        int fragmentStart = title.indexOf("#");
        if (fragmentStart >= 0) {
            title = title.substring(0, fragmentStart);
        }

        // remove leading and trailing spaces
        title = title.trim();

        // Enforce Wikipedia's "first letter always caps, everything else is
        // case-sensitive" policy.
        title = StringUtils.capitalize(title);

        // collapse whitespace around the first whitespace-surrounded colon
        // (only a problem for categories in this context)
        Matcher m = COLON_PATTERN.matcher(title);
        if (m.find()) {
            title = m.replaceFirst(":");
        }

        // normalize the category aliases
        // NOTE(review): this only rewrites when the matched prefix already equals the
        // default category namespace name - confirm whether "!equals" was intended.
        m = lang.getCategoryReplacePattern().matcher(title);
        if (m.find() && m.group(1).equals(lang.getDefaultCategoryNamespaceName())) {
            title = lang.getDefaultCategoryNamespaceName() + ":" + m.group(2);
        }

        // ensure reasonable capitalization with namespaces: the part following the
        // namespace prefix must also conform to the first-letter-caps policy
        String namespace = getNamespaceString(title);
        if (namespace != null) {
            title = namespace + ":" + StringUtils.capitalize(getTitleStringWithoutNamespace(title));
        }

        // strip left-to-right marks (U+200E) that sometimes sneak into titles
        title = title.replace("\u200E", "");
        return title;
    }

    /**
     * Restores the canonical title. The language is not part of the serialized form
     * and is left unset.
     */
    @Override
    public void readExternal(ObjectInput in) throws IOException,
            ClassNotFoundException {
        this.canonicalTitle = in.readUTF();
    }

    /**
     * Writes the canonical title only; the language is not serialized.
     */
    @Override
    public void writeExternal(ObjectOutput out) throws IOException {
        out.writeUTF(canonicalTitle);
    }
}