package org.jsoup.nodes;
import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.integration.UrlConnectTest;
import org.jsoup.nodes.Entities;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.Map;
/**
* Fetches HTML entity names from w3.org json, and outputs data files for optimized used in Entities.
* I refuse to believe that entity names like "NotNestedLessLess" are valuable or useful for HTML authors. Implemented
* only to be complete.
*/
class BuildEntities {
private static final String projectDir = "/Users/jhy/projects/jsoup";
public static void main(String[] args) throws IOException {
String url = "https://www.w3.org/TR/2012/WD-html5-20121025/entities.json";
Connection.Response res = Jsoup.connect(url)
.ignoreContentType(true)
.userAgent(UrlConnectTest.browserUa)
.execute();
Gson gson = new Gson();
Map<String, CharacterRef> input = gson.fromJson(res.body(),
new TypeToken<Map<String, CharacterRef>>() {
}.getType());
// build name sorted base and full character lists:
ArrayList<CharacterRef> base = new ArrayList<CharacterRef>();
ArrayList<CharacterRef> full = new ArrayList<CharacterRef>();
for (Map.Entry<String, CharacterRef> entry : input.entrySet()) {
String name = entry.getKey().substring(1); // name is like ´ or ´ , trim &
CharacterRef ref = entry.getValue();
if (name.endsWith(";")) {
name = name.substring(0, name.length() - 1);
full.add(ref);
} else {
base.add(ref);
}
ref.name = name;
}
Collections.sort(base, byName);
Collections.sort(full, byName);
// now determine code point order
ArrayList<CharacterRef> baseByCode = new ArrayList<CharacterRef>(base);
ArrayList<CharacterRef> fullByCode = new ArrayList<CharacterRef>(full);
Collections.sort(baseByCode, byCode);
Collections.sort(fullByCode, byCode);
// and update their codepoint index. Don't
ArrayList<CharacterRef>[] codelists = new ArrayList[]{baseByCode, fullByCode};
for (ArrayList<CharacterRef> codelist : codelists) {
for (int i = 0; i < codelist.size(); i++) {
codelist.get(i).codeIndex = i;
}
}
// now write them
persist("entities-full.properties", full);
persist("entities-base.properties", base);
System.out.println("Full size: " + full.size() + ", base size: " + base.size());
}
private static void persist(String name, ArrayList<CharacterRef> refs) throws IOException {
String base = projectDir + "/src/main/java/org/jsoup/nodes";
File file = new File(base, name);
FileWriter writer = new FileWriter(file, false);
for (CharacterRef ref : refs) {
writer.append(ref.toString()).append("\n");
}
writer.close();
}
private static class CharacterRef {
int[] codepoints;
String name;
int codeIndex;
@Override
public String toString() {
return name
+ "="
+ d(codepoints[0])
+ (codepoints.length > 1 ? "," + d(codepoints[1]) : "")
+ ";" + d(codeIndex);
}
}
private static String d(int d) {
return Integer.toString(d, Entities.codepointRadix);
}
private static class ByName implements Comparator<CharacterRef> {
public int compare(CharacterRef o1, CharacterRef o2) {
return o1.name.compareTo(o2.name);
}
}
private static class ByCode implements Comparator<CharacterRef> {
public int compare(CharacterRef o1, CharacterRef o2) {
int[] c1 = o1.codepoints;
int[] c2 = o2.codepoints;
int first = c1[0] - c2[0];
if (first != 0)
return first;
if (c1.length == 1 && c2.length == 1) { // for the same code, use the shorter name
int len = o2.name.length() - o1.name.length();
if (len != 0)
return len;
return o1.name.compareTo(o2.name);
}
if (c1.length == 2 && c2.length == 2)
return c1[1] - c2[1];
else
return c2.length - c1.length; // pushes multi down the list so hits on singles first (don't support multi lookup by codepoint yet)
}
}
private static ByName byName = new ByName();
private static ByCode byCode = new ByCode();
}