package org.wikibrain.wikidata; import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; import com.google.gson.*; import org.wikibrain.core.lang.Language; import org.wikibrain.core.lang.LanguageSet; import org.wikibrain.parser.WpParseException; import org.wikidata.wdtk.datamodel.helpers.DatamodelConverter; import org.wikidata.wdtk.datamodel.helpers.ToString; import org.wikidata.wdtk.datamodel.interfaces.*; import org.wikidata.wdtk.datamodel.json.jackson.*; import org.wikidata.wdtk.datamodel.json.jackson.datavalues.JacksonValue; import java.io.BufferedWriter; import java.io.IOException; import java.util.*; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * @author Shilad Sen */ public class WikidataParser { private static final Logger LOG = LoggerFactory.getLogger(WikidataParser.class); private final LanguageSet langs; final ObjectMapper mapper = new ObjectMapper(); private final DatamodelConverter datamodelConverter; public WikidataParser() { this(LanguageSet.ALL); } public WikidataParser(LanguageSet langs) { this.datamodelConverter = new DatamodelConverter( new JacksonObjectFactory()); this.langs = langs; } private static BufferedWriter writer; public WikidataEntity parse(String json) throws WpParseException { JacksonTermedStatementDocument mwDoc; try { mwDoc = mapper.readValue(json, JacksonTermedStatementDocument.class); } catch (IOException e) { LOG.info("Error parsing: " + json); throw new WpParseException(e); } WikidataEntity record = new WikidataEntity(mwDoc.getEntityId().getId()); // Aliases (multiple per language) for (List<MonolingualTextValue> vlist : mwDoc.getAliases().values()) { if (vlist.isEmpty()) continue; if (!validLanguage(vlist.get(0).getLanguageCode())) continue; Language lang = Language.getByLangCodeLenient(vlist.get(0).getLanguageCode()); record.getAliases().put(lang, new ArrayList<String>()); for (MonolingualTextValue v : vlist) { record.getAliases().get(lang).add(v.getText()); } } // Descriptions (one per language) for (MonolingualTextValue v : mwDoc.getDescriptions().values()) { if (validLanguage(v.getLanguageCode())) { Language lang = Language.getByLangCodeLenient(v.getLanguageCode()); record.getDescriptions().put(lang, v.getText()); } } // Labels (one per language) for (MonolingualTextValue v : mwDoc.getLabels().values()) { if (validLanguage(v.getLanguageCode())) { Language lang = Language.getByLangCodeLenient(v.getLanguageCode()); record.getLabels().put(lang, v.getText()); } } // Claims (only for Item entities) if (mwDoc instanceof JacksonItemDocument) { for (List<JacksonStatement> statements : ((JacksonItemDocument)mwDoc).getJsonClaims().values()) { for (JacksonStatement s : statements) { record.getStatements().add(parseOneClaim(record, s)); } } } return record; } private WikidataStatement parseOneClaim(WikidataEntity item, JacksonStatement js) throws WpParseException { String propId =js.getMainsnak().getProperty(); // e.g. "P34" WikidataEntity prop = new WikidataEntity(WikidataEntity.Type.PROPERTY, Integer.valueOf(propId.substring(1))); String valTypeStr = js.getMainsnak().accept(new SnakVisitor<String>() { @Override public String visit(ValueSnak snak) { return "value"; } @Override public String visit(SomeValueSnak snak) { return "somevalue"; } @Override public String visit(NoValueSnak snak) { return "novalue"; } }); JsonElement jsonVal = null; WikidataValue value; if (valTypeStr.equals("value")) { // more specific type available JacksonValueSnak snak = (JacksonValueSnak)js.getMainsnak(); valTypeStr = ((JacksonValue)snak.getValue()).getType(); value = snakToValue(valTypeStr, snak.getValue()); } else { value = jsonToValue(valTypeStr, jsonVal); } WikidataStatement.Rank rank; if (js.getRank() == null) { rank = null; } else if (js.getRank() == StatementRank.PREFERRED) { rank = WikidataStatement.Rank.PREFERRED; } else if (js.getRank() == StatementRank.NORMAL) { rank = WikidataStatement.Rank.NORMAL; } else if (js.getRank() == StatementRank.DEPRECATED) { rank = WikidataStatement.Rank.DEPRECATED; } else { throw new WpParseException("unknown rank: " + js.getRank() + " in " + js); } String uuid = js.getStatementId(); return new WikidataStatement(uuid, item, prop, value, rank); } public WikidataValue snakToValue(final String type, Value snak) { String jsonStr = null; try { jsonStr = mapper.writeValueAsString(snak); } catch (JsonProcessingException e) { throw new IllegalArgumentException("Unknown snak: " + snak); } final JsonElement element = new JsonParser().parse(jsonStr); final JsonElement jsonValue = (element.isJsonObject() && element.getAsJsonObject().has("value")) ? element.getAsJsonObject().get("value") : null; WikidataValue val = snak.accept(new ValueVisitor<WikidataValue>() { @Override public WikidataValue visit(EntityIdValue value) { if (value.getEntityType().equals(EntityIdValue.ET_ITEM)) { return WikidataValue.forItem(Integer.valueOf(value.getId().substring(1))); } else if (value.getEntityType().equals(EntityIdValue.ET_PROPERTY)) { throw new IllegalArgumentException("Did not expect entity property"); } else { throw new IllegalArgumentException("Unknown entity type: " + value.getEntityType()); } } @Override public WikidataValue visit(GlobeCoordinatesValue value) { return new WikidataValue(type, gsonToPrimitive(jsonValue), jsonValue); } @Override public WikidataValue visit(QuantityValue value) { return new WikidataValue(type, gsonToPrimitive(jsonValue), jsonValue); } @Override public WikidataValue visit(StringValue value) { return WikidataValue.forString(value.getString()); } @Override public WikidataValue visit(TimeValue value) { Calendar c = new GregorianCalendar( ((int)value.getYear()), value.getMonth()-1, value.getDay(), value.getHour(), value.getMinute(), value.getSecond()); return new WikidataValue( WikidataValue.Type.TIME, c.getTime(), jsonValue ); } @Override public WikidataValue visit(MonolingualTextValue value) { return WikidataValue.forString(value.getText()); } @Override public WikidataValue visit(DatatypeIdValue value) { throw new IllegalArgumentException(); } }); return val; } public WikidataValue jsonToValue(String type, JsonElement element) throws WpParseException { if (type.equals("somevalue")) { return new WikidataValue(WikidataValue.Type.SOMEVALUE, null, JsonNull.INSTANCE); } else if (type.equals("novalue")) { return new WikidataValue(WikidataValue.Type.NOVALUE, null, JsonNull.INSTANCE); } else if (type.equals("item") || type.equals("property")) { type = "wikibase-entityid"; } String fullJson = "{ \"type\" : \"" + type + "\", \"value\" : " + element.toString() + " }"; try { Value snak = mapper.readValue(fullJson, JacksonValue.class); return snakToValue(type, snak); } catch (IOException e) { throw new WpParseException(e); } } public static Object gsonToPrimitive(JsonElement element) { if (element.isJsonPrimitive()) { JsonPrimitive prim = element.getAsJsonPrimitive(); if (prim.isString()) { return prim.getAsString(); } else if (prim.isBoolean()) { return prim.getAsBoolean(); } else if (prim.isNumber()) { return prim.getAsNumber(); } else { throw new IllegalArgumentException("Unknown Gson primitive: " + prim); } } else if (element.isJsonArray()) { JsonArray array = element.getAsJsonArray(); List<Object> list = new ArrayList<Object>(); for (int i = 0; i < array.size(); i++) { list.add(gsonToPrimitive(array.get(i))); } return list; } else if (element.isJsonNull()) { return null; } else if (element.isJsonObject()) { Map<String, Object> map = new HashMap<String, Object>(); for (Map.Entry<String, JsonElement> entry : element.getAsJsonObject().entrySet()) { map.put(entry.getKey(), gsonToPrimitive(entry.getValue())); } return map; } else { throw new IllegalArgumentException("Unknown Gson value: " + element); } } private boolean validLanguage(String langCode) { return Language.hasLangCode(langCode) && langs.containsLanguage(langCode); } }