package it.cnr.isti.hpc.erd; import it.cnr.isti.hpc.io.reader.RecordParser; import java.util.Scanner; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; /** * Copyright 2014 Diego Ceccarelli * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /** * @author Diego Ceccarelli <diego.ceccarelli@isti.cnr.it> * * Created on Mar 15, 2014 */ public class WikipediaLabelToFreebaseRecord { private static final Logger logger = LoggerFactory .getLogger(WikipediaLabelToFreebaseRecord.class); private String freebaseId; private String label; private String wikipediaLabel; public WikipediaLabelToFreebaseRecord() { super(); } public String getCleanWikipediaLabel() { return convert(wikipediaLabel).replaceAll("/wikipedia/en_title/", ""); } public String getFreebaseId() { return freebaseId; } public void setFreebaseId(String freebaseId) { this.freebaseId = freebaseId; } public String getLabel() { return label; } public void setLabel(String label) { this.label = label; } public String getWikipediaLabel() { return wikipediaLabel; } public void setWikipediaLabel(String wikipediaLabel) { this.wikipediaLabel = wikipediaLabel; } final static private Pattern quotedCharPattern = Pattern .compile("\\$([0-9A-Fa-f]{4})"); protected String convert(String s) { // see // https://github.com/OpenRefine/OpenRefine/blob/master/extensions/freebase/src/com/google/refine/freebase/expr/MqlKeyUnquote.java StringBuffer sb = new StringBuffer(); int last = 0; Matcher m = quotedCharPattern.matcher(s); while (m.find()) { int start = m.start(); int end = m.end(); if (start > last) { sb.append(s.substring(last, start)); } last = end; sb.append((char) Integer.parseInt(s.substring(start + 1, end), 16)); } if (last < s.length()) { sb.append(s.substring(last)); } return sb.toString(); } public static WikipediaLabelToFreebaseRecord parse(String tsvString) { Scanner scanner = new Scanner(tsvString).useDelimiter("\t"); WikipediaLabelToFreebaseRecord record = new WikipediaLabelToFreebaseRecord(); record.setFreebaseId(scanner.next()); String label = scanner.next(); // removes the first " and the final "@en if (!label.isEmpty()) { label = label.substring(1, label.length() - 4); } else { // empty label logger.warn("not label for line \n {}", tsvString); } record.setLabel(label); String wikilabel = scanner.next(); wikilabel = wikilabel.substring(1, wikilabel.length() - 1); record.setWikipediaLabel(wikilabel); return record; } protected String decode(String utf8endodedLabel) { return convert(utf8endodedLabel); } public static class Parser implements RecordParser<WikipediaLabelToFreebaseRecord> { public WikipediaLabelToFreebaseRecord decode(String record) { return WikipediaLabelToFreebaseRecord.parse(record); } public String encode(WikipediaLabelToFreebaseRecord obj) { // TODO Auto-generated method stub return null; } } }