package net.jangaroo.tools.asdocscreenscraper; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.tidy.Configuration; import org.w3c.tidy.Tidy; import org.xml.sax.InputSource; import org.xml.sax.SAXException; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathConstants; import javax.xml.xpath.XPathExpression; import javax.xml.xpath.XPathExpressionException; import javax.xml.xpath.XPathFactory; import java.io.BufferedInputStream; import java.io.File; import java.io.IOException; import java.io.PrintWriter; import java.io.StringReader; import java.io.StringWriter; import java.net.URI; import java.net.URISyntaxException; import java.util.LinkedHashSet; import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * A tool to screen-scrape ASDoc XHTML and thus reverse-engineer AS3 API source code. */ public class ASDocScreenScraper { private static final Tidy TIDY; private static XPathFactory xPathFactory = XPathFactory.newInstance(); private static final Pattern RELATIVE_TYPE_URL_PATTERN = Pattern.compile("(?:\\.\\./)*(.*)\\.html(#.*)?"); private Set<String> imports = new LinkedHashSet<String>(); private static final String ADOBE_FLASH_PLATFORM_REFERENCE_BASE_URL = "http://help.adobe.com/en_US/FlashPlatform/reference/actionscript/3/"; private static final String IGNORE_UNTIL = "ignore-until:"; private static final String IGNORE_UNTIL_NEXT_AT_MODE = IGNORE_UNTIL + "next-atMode"; private static final String IGNORE_UNTIL_BR = IGNORE_UNTIL + "br"; static { TIDY = new Tidy(); TIDY.setCharEncoding(Configuration.UTF8); TIDY.setAltText(""); TIDY.setDropEmptyParas(true); TIDY.setDropFontTags(true); TIDY.setFixComments(true); TIDY.setHideEndTags(false); TIDY.setIndentAttributes(true); TIDY.setMakeClean(true); TIDY.setQuiet(true); TIDY.setQuoteAmpersand(true); TIDY.setShowWarnings(false); TIDY.setXHTML(true); TIDY.setXmlOut(true); TIDY.setXmlSpace(false); TIDY.setXmlPi(false); } public static void main(String[] args) throws IOException, ParserConfigurationException, SAXException, XPathExpressionException, TransformerException, URISyntaxException { if (args.length > 0) { new ASDocScreenScraper(ADOBE_FLASH_PLATFORM_REFERENCE_BASE_URL + args[0].replaceAll("\\.", "/") + ".html").scrape(); return; } Document classSummaryDocument = loadAndParse(new URI(ADOBE_FLASH_PLATFORM_REFERENCE_BASE_URL + "class-summary.html")); XPath xpath = xPathFactory.newXPath(); XPathExpression classNodesExpression = xpath.compile("//*[@class='summaryTable']//*[name()='tr'][not(@product)][contains(@runtime,'Flash::')]//*[name()='a']/@href"); NodeList classNodes = (NodeList)classNodesExpression.evaluate(classSummaryDocument, XPathConstants.NODESET); System.out.println("Hits: " + classNodes.getLength()); for (int i = 0; i < classNodes.getLength(); i++) { String relativeClassUrl = classNodes.item(i).getNodeValue(); if (!relativeClassUrl.endsWith("package-detail.html")) { System.out.println(relativeClassUrl); new ASDocScreenScraper(ADOBE_FLASH_PLATFORM_REFERENCE_BASE_URL + relativeClassUrl).scrape(); } } } private URI url; private Document document; private String packageName; private String className; private boolean isInterface; public ASDocScreenScraper(String url) throws URISyntaxException { this.url = new URI(url); } public void scrape() throws IOException, ParserConfigurationException, SAXException, XPathExpressionException, TransformerException { Document doc = loadAndParse(url); this.document = doc; XPath xpath = xPathFactory.newXPath(); XPathExpression packageNameExpression = xpath.compile("//*[@id='packageName']/text()"); Node packageNameNode = (Node)packageNameExpression.evaluate(doc, XPathConstants.NODE); packageName = htmlTrim(packageNameNode.getNodeValue()); if ("Top Level".equals(packageName)) { packageName = ""; } String packageDirs = "../joo/" + packageName.replaceAll("\\.", "/"); File packageDirsFile = new File(packageDirs); if (!packageDirsFile.mkdirs()) { System.out.println("[INFO] Package directory " + packageDirsFile.getAbsolutePath() + " already exists."); } className = htmlTrim(packageNameNode.getParentNode().getNextSibling().getNextSibling().getNodeValue()); File classFile = new File(packageDirsFile, className + ".as"); System.out.println("Writing file " + classFile.getCanonicalPath()); PrintWriter writer = new PrintWriter(classFile, "UTF-8"); writer.println("package " + packageName + " {"); XPathExpression classDeclarationExpression = xpath.compile("/*[name()='html']/*[name()='body']/*[name()='div']/*[name()='div'][@id='content']/*[name()='div'][1]/*[name()='div'][1]/*[name()='table'][1]/*[name()='tr'][2]/*[name()='td'][2]/text()"); Node classDeclarationNode = (Node)classDeclarationExpression.evaluate(doc, XPathConstants.NODE); String classDeclaration = classDeclarationNode.getNodeValue(); isInterface = classDeclaration.indexOf("interface") != -1; // TODO: more exact match (Pattern) needed? while (classDeclarationNode.getNextSibling() != null) { classDeclarationNode = classDeclarationNode.getNextSibling(); classDeclaration += "a".equals(classDeclarationNode.getNodeName()) ? toType(classDeclarationNode) : classDeclarationNode.getNodeValue(); } XPathExpression extendsExpression = xpath.compile("//*[@class='classHeaderTableLabel'][text()='Inheritance']/following-sibling::*/*[name()='a']"); Node extendsClassNode = (Node)extendsExpression.evaluate(doc, XPathConstants.NODE); String extendsClause = ""; if (extendsClassNode != null) { String extendsClass = toType(extendsClassNode); if (!"Object".equals(extendsClass)) { extendsClause = " extends " + extendsClass; } } XPathExpression implementsExpression = xpath.compile("//*[@class='classHeaderTableLabel'][text()='Implements']/following-sibling::*/*[name()='a']"); NodeList implementsNodes = (NodeList)implementsExpression.evaluate(doc, XPathConstants.NODESET); String implementsClause = getImplementsClause(implementsNodes); XPathExpression propertyDeclarations = xpath.compile("//*[@class='content']//*[name()='span'][not(@product)][contains(@runtime,'Flash::')] | //*[@class='content']/*[@class='MainContent']/*[name()='span'][not(@product)][not(@runtime)]"); //XPathExpression propertyDeclarations = xpath.compile("//*[@class='MainContent'][2]/*[name()='div'][@class='detailBody']"); NodeList propertyDeclarationNodes = (NodeList)propertyDeclarations.evaluate(doc, XPathConstants.NODESET); XPathExpression eventNameExpression = xPathFactory.newXPath().compile(".//*[name()='h2']/text()"); StringBuilder eventCode = new StringBuilder(); StringBuilder memberCode = new StringBuilder(); for (int i = 0; i < propertyDeclarationNodes.getLength(); i++) { Node propertyDeclarationNode = propertyDeclarationNodes.item(i); Node propertyDeclarationHeader = propertyDeclarationNode.getNextSibling(); propertyDeclarationNode = propertyDeclarationHeader.getNextSibling(); if (!(propertyDeclarationNode != null && propertyDeclarationNode instanceof Element && "detailBody".equals(((Element)propertyDeclarationNode).getAttribute("class")))) { System.out.println("[WARN] Property declaration not followed by <div class='detailBody'>."); } else { NodeList eventNodes = (NodeList)xpath.evaluate("*[name()='a']/*[name()='code']/text()", propertyDeclarationNode, XPathConstants.NODESET); Node implementationNode = (Node)xpath.evaluate("*[name()='span'][@class='label'][text()='Implementation']", propertyDeclarationNode, XPathConstants.NODE); NodeList docNodes = propertyDeclarationNode.getChildNodes(); if (eventNodes.getLength() == 2) { eventCode.append(getASDoc(docNodes, eventNodes.item(1))); String eventName = ((String)eventNameExpression.evaluate(propertyDeclarationHeader, XPathConstants.STRING)).trim(); eventCode.append("[Event(name=\"").append(eventName) .append("\", type=\"").append(eventNodes.item(0).getNodeValue()).append("\")]\n"); } else { memberCode.append(getASDoc(docNodes, null)); memberCode.append(unparseCode(implementationNode == null ? propertyDeclarationNode.getFirstChild() : implementationNode.getNextSibling())); } } } writer.println(getImports()); writer.println(eventCode); writer.print(getClassDoc()); writer.println(classDeclaration + extendsClause + implementsClause + " {"); writer.print(memberCode); writer.println("}"); writer.println("}"); writer.flush(); writer.close(); } private String getImplementsClause(NodeList implementsNodes) { StringBuilder sb = new StringBuilder(); for (int i = 0; i < implementsNodes.getLength(); i++) { sb.append(i == 0 ? " implements " : ", "); sb.append(toType(implementsNodes.item(i))); } return sb.toString(); } private static Document loadAndParse(URI url) throws TransformerException, ParserConfigurationException, SAXException, IOException { Document document = TIDY.parseDOM(new BufferedInputStream(url.toURL().openStream()), null); DOMSource domSource = new DOMSource(document.getDocumentElement()); Transformer serializer = TransformerFactory.newInstance().newTransformer(); serializer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "no"); serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); serializer.setOutputProperty(OutputKeys.DOCTYPE_PUBLIC, "-//W3C//DTD XHTML 1.0 Transitional//EN"); //serializer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"); String localXHtmlDoctype = new File(".").getAbsoluteFile().toURI().toString() + "xhtml1/DTD/xhtml1-transitional.dtd"; serializer.setOutputProperty(OutputKeys.DOCTYPE_SYSTEM, localXHtmlDoctype); StringWriter result = new StringWriter(); serializer.transform(domSource, new StreamResult(result)); String xhtmlText = result.toString(); // filter out duplicate IDs: xhtmlText = xhtmlText.replaceAll(" id=\"(pageFilter|propertyDetail)\"", ""); /* xhtmlText = xhtmlText.replaceAll(" runtime=\"[^\"]*\"", ""); xhtmlText = xhtmlText.replaceAll(" target=\"\"", " target=\"_self\""); xhtmlText = xhtmlText.replaceAll(" href=\"\"", " href=\"#\""); xhtmlText = xhtmlText.replaceAll(" nowrap=\"true\"", " nowrap=\"nowrap\""); xhtmlText = xhtmlText.replaceAll("class=\"searchFormION\"", "class=\"searchFormION\" action=\"#\""); xhtmlText = xhtmlText.replaceAll(" xmlns:xd=\"http://www.pnp-software.com/XSLTdoc\"", ""); String oldXHtmlText; do { oldXHtmlText = xhtmlText; xhtmlText = xhtmlText.replaceAll(" (id|name)=\"([^(),\"]*)(\\(|\\)|,)", " $1=\"$2_"); } while (!oldXHtmlText.equals(xhtmlText)); */ DocumentBuilderFactory domFactory = DocumentBuilderFactory.newInstance(); domFactory.setValidating(false); domFactory.setNamespaceAware(true); // never forget this! DocumentBuilder builder = domFactory.newDocumentBuilder(); return builder.parse(new InputSource(new StringReader(xhtmlText))); } private String getImports() { StringBuilder sb = new StringBuilder(); for (String importType : imports) { sb.append("import ").append(importType).append(";\n"); } return sb.toString(); } private String unparseCode(Node codeNode) { StringBuilder sb = new StringBuilder(); while (true) { while (codeNode != null && "br".equals(codeNode.getNodeName())) { // skip <br>s codeNode = codeNode.getNextSibling(); } if (codeNode == null || !"code".equals(codeNode.getNodeName())) { // no following <code> node break; } if (sb.length() > 0) { // use the same ASDoc for setter as for getter: sb.append("/**\n * @private\n */\n"); } sb.append(" "); NodeList childNodes = codeNode.getChildNodes(); for (int j = 0; j < childNodes.getLength(); j++) { Node childNode = childNodes.item(j); String nodeValue = toType(childNode); if (nodeValue == null) { nodeValue = "#text".equals(childNode.getNodeName()) ? childNode.getNodeValue() : childNode.getFirstChild().getNodeValue(); } if (isInterface) { nodeValue = nodeValue.replaceFirst("\\bpublic ", ""); } sb.append(nodeValue); } if (!isInterface && sb.indexOf("function ") != -1) { sb.append(" {\n throw new Error('not implemented'); // TODO: implement!\n }\n"); } else { sb.append(";\n"); } codeNode = codeNode.getNextSibling(); } return sb.toString(); } private String toType(Node aNode) { String href = getAttributeValueIfMatches(aNode, "a", "href"); if (href != null) { if (href.indexOf("specialTypes.html#") != -1) { return aNode.getFirstChild().getNodeValue(); } // types are represented as URLs relative to this class' package URL, but have to be package.Type. // ASDoc's habit of going back to the top level package using ".."s makes things easier: Matcher matcher = RELATIVE_TYPE_URL_PATTERN.matcher(href); if (matcher.matches()) { String typeName = matcher.group(1).replaceAll("/", "."); if (typeName.endsWith(".package-detail")) { return typeName.substring(0, typeName.length() - ".package-detail".length()); } addImport(typeName); String memberName = matcher.group(2); if (memberName != null) { return className.equals(typeName) ? memberName : typeName + memberName; } return typeName; } } return null; } private void addImport(String fullyQualifiedTypeName) { int dotPos = fullyQualifiedTypeName.lastIndexOf('.'); if (dotPos != -1) { String packageName = fullyQualifiedTypeName.substring(0, dotPos); if (!packageName.equals(this.packageName)) { imports.add(fullyQualifiedTypeName); } } } private String getClassDoc() throws XPathExpressionException { XPath xpath = xPathFactory.newXPath(); XPathExpression classDoc = xpath.compile("//*[@id='content']//*[name()='div'][@class='MainContent']"); Node classDocNode = (Node)classDoc.evaluate(document, XPathConstants.NODE); if (classDocNode != null) { NodeList classDocNodes = classDocNode.getChildNodes(); return getASDoc(classDocNodes, null); } return ""; } private String getASDoc(NodeList classDocNodes, Node eventNode) throws XPathExpressionException { if (classDocNodes.getLength() > 0) { StringBuilder writer = new StringBuilder(); writer.append("/**\n"); writer.append(" * "); boolean first = true; String atMode = null; XPathExpression atModeExpression = xPathFactory.newXPath().compile("*[name()='span'][@class='label' or @class='classHeaderTableLabel']"); XPathExpression runtimeOrLanguageVersionExpression = xPathFactory.newXPath().compile(".//*[name()='td']/*[name()='b'][contains(text(),'Language Version')] | .//*[name()='td']/*[name()='b'][contains(text(), 'Runtime Version')]"); for (int i = 0; i < classDocNodes.getLength(); i++) { Node node = classDocNodes.item(i); if (first) { // skip initial <code> element: if ("code".equals(node.getNodeName())) { // also skip following text node: if (node.getNextSibling() != null && "#text".equals(node.getNextSibling().getNodeName())) { ++i; } continue; } else if (isP(node) && node.getFirstChild() == null) { // skip all initial empty <p>s: continue; } } if (IGNORE_UNTIL_BR.equals(atMode)) { if ("br".equals(node.getNodeName())) { atMode = null; } } else if ("@see".equals(atMode)) { appendSeeAlsos(writer, node); atMode = null; } else if ("@example".equals(atMode)) { writer.append("\n * @example ").append(unparse(node, true)); atMode = null; } else if ("@param".equals(atMode) || "@return".equals(atMode) || "@throws".equals(atMode)) { if ("table".equals(node.getNodeName())) { appendParamsOrReturnOrThrows(atMode, writer, node); atMode = null; } } else { if ("classHeaderTable".equals(getAttributeValueIfMatches(node, "table", "class"))) { continue; } Node runtimeOrLanguageVersionNode = (Node)runtimeOrLanguageVersionExpression.evaluate(node, XPathConstants.NODE); if (runtimeOrLanguageVersionNode != null) { atMode = null; // so far, skip runtime and language version information. continue; } Node atModeNode = (Node)atModeExpression.evaluate(node, XPathConstants.NODE); String newAtMode = determineAtMode(atModeNode == null ? node : atModeNode); if (newAtMode != null) { atMode = newAtMode; } else if (!(atMode != null && atMode.startsWith(IGNORE_UNTIL))) { if (eventNode != null && !first && "#text".equals(node.getNodeName())) { // detected copied ASDoc from event type constant, see http://livedocs.adobe.com/flex/3/html/help.html?content=asdoc_4.html "documenting ... events": break; } String nodeValue = unparse(node, first && isP(node)); if (nodeValue.length() > 0) { first = false; writer.append(nodeValue); } } } } if (eventNode != null) { writer.append("\n * @eventType ").append(eventNode.getNodeValue()); } writer.append("\n */\n"); return writer.toString(); } return ""; } private static String determineAtMode(Node atModeNode) { String pStyleClass = getAttributeValueIfMatches(atModeNode, "span", "class"); if ("label".equals(pStyleClass) || "classHeaderTableLabel".equals(pStyleClass)) { String label = atModeNode.getFirstChild().getNodeValue().trim(); return "Parameters".equals(label) ? "@param" : "Returns".equals(label) ? "@return" : "See also".equals(label) ? "@see" : "Throws".equals(label) ? "@throws" : "Implementation".equals(label) ? IGNORE_UNTIL_NEXT_AT_MODE : "Event Object Type:".equals(label) || label.matches("property (.*)\\.type =") ? IGNORE_UNTIL_BR : label.startsWith("Example") ? "@example" : null; } return null; } private void appendSeeAlsos(StringBuilder writer, Node node) { NodeList seeNodes = node.getChildNodes(); for (int j = 0; j < seeNodes.getLength(); j++) { Node seeNode = seeNodes.item(j); if ("a".equals(seeNode.getNodeName())) { String hrefText = seeNode.getAttributes().getNamedItem("href").getNodeValue(); if (hrefText.indexOf("specialTypes.html#") != -1 || hrefText.indexOf("statements.html#") != -1) { hrefText = url.resolve(hrefText).toString(); } String seeText; if (hrefText.startsWith("http://")) { seeText = hrefText + " " + seeNode.getFirstChild().getNodeValue(); } else { seeText = toType(seeNode); } writer.append("\n * @see ").append(seeText); } } writer.append("\n * "); } private void appendParamsOrReturnOrThrows(String atMode, StringBuilder writer, Node node) throws XPathExpressionException { XPathExpression paramNameExpression = xPathFactory.newXPath().compile(".//*[name()='span'][@class='label']"); XPathExpression errorTypeExpression = xPathFactory.newXPath().compile(".//*[name()='a']"); NodeList paramRowNodes = node.getChildNodes(); for (int i = 0; i < paramRowNodes.getLength(); i++) { Node paramRowNode = paramRowNodes.item(i); String firstTdClass = getAttributeValueIfMatches(paramRowNode.getFirstChild(), "td", "class"); if ("paramSpacer".equals(firstTdClass)) { continue; } String key = "@param".equals(atMode) ? ((String)paramNameExpression.evaluate(paramRowNode, XPathConstants.STRING)).trim() : "@throws".equals(atMode) ? toType((Node)errorTypeExpression.evaluate(paramRowNode, XPathConstants.NODE)) : ""; String text = unparse(paramRowNode.getChildNodes().item(1), true); // unparse second td (first is only a spacer) int mdashIndex = text.indexOf("� "); if (mdashIndex != -1) { text = text.substring(mdashIndex + "� ".length()); } writer.append("\n * ").append(atMode).append(" "); if (key.length() > 0) { writer.append(key).append(' '); } writer.append(text); } writer.append("\n * "); } private static boolean isP(Node node) { return "p".equals(node.getNodeName()); } private String unparse(Node node, boolean suppressOuterElement) { String tag = node.getNodeName(); if ("#text".equals(tag)) { return node.getNodeValue(); } if ("br".equals(tag) || "hr".equals(tag)) { return ""; } String divClass = getAttributeValueIfMatches(node, "div", "class"); if ("listing".equals(divClass)) { tag = "listing"; } String childrenUnparsed; if ("listing".equals(tag)) { childrenUnparsed = node.getFirstChild().getFirstChild().getNodeValue(); childrenUnparsed = ("\n" + childrenUnparsed).replaceAll("\n", "\n * "); } else { StringBuilder sb = new StringBuilder(); NodeList childNodes = node.getChildNodes(); for (int i = 0; i < childNodes.getLength(); i++) { Node childNode = childNodes.item(i); sb.append(unparse(childNode, false)); } childrenUnparsed = sb.toString(); } if ("span".equals(tag) || "tbody".equals(tag) || suppressOuterElement || (!"img".equals(tag) && childrenUnparsed.length() == 0)) { return childrenUnparsed; } boolean isBlockElement = "p".equals(tag) || "div".equals(tag) || "ul".equals(tag) || "li".equals(tag) || "listing".equals(tag) || "pre".equals(tag) || "table".equals(tag) || "tr".equals(tag) || "td".equals(tag); StringBuilder sb = new StringBuilder(); if (isBlockElement) { sb.append("\n * "); } sb.append('<').append(tag); String href = getAttributeValueIfMatches(node, "a", "href"); if (href != null) { sb.append(" href=\"").append(url.resolve(href)).append("\""); } String src = getAttributeValueIfMatches(node, "img", "src"); if (src != null) { sb.append(" src=\"").append(url.resolve(src)).append("\" />"); return sb.toString(); } sb.append('>'); sb.append(childrenUnparsed); sb.append("</").append(tag).append('>'); return sb.toString(); } private static String getAttributeValueIfMatches(Node node, String nodeName, String attributeName) { if (nodeName.equals(node.getNodeName())) { Node attributeNode = node.getAttributes().getNamedItem(attributeName); if (attributeNode != null) { return attributeNode.getNodeValue(); } } return null; } private static String htmlTrim(String str) { return str.replaceAll("\u00A0", " ").trim(); } }