package io.monokkel.core; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import io.monokkel.core.api.ResponseParser; import io.monokkel.domain.PageData; import io.monokkel.exceptions.ParseException; import org.apache.commons.lang.StringUtils; import org.json.simple.JSONObject; import org.json.simple.parser.ContainerFactory; import org.json.simple.parser.JSONParser; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.util.*; import java.util.stream.Collectors; import static io.monokkel.core.utils.ParserUtils.doesTheListHaveHeader; import static java.lang.String.format; /** * Created by Tarjei on 25/06/14. * <p/> * This class is a JSON parser that receives a json string and returns extracted content * based on a simple expression path. The returned extracted content is always wrapped as a json object. * The path is added as a array in the constructor. * <p/> * A few examples (json is unescaped): * <p/> * ===== Example 1 ===== * <p/> * String json = "{ * "data": { * "field": "This is a field" * }, * "list":[ * { * "element": "element" * } * ] * }"; * <p/> * ArrayList<String> path = new ArrayList<String>(); * path.add("data"); * path.add("field"); * <p/> * JsonTransformer jsonParser = new JsonTransformer(path,..); * ParseOutput parseOutput = jsonParser.parse("http://url",json,1L); * parseOutput.getExtractedContent(); // Returns the string {"field":"This is a field"} * <p/> * ===== Example 2 ===== * <p/> * ArrayList<String> path = new ArrayList<String>(); * path.add("list"); * path.add("0"); // Index of the list * <p/> * JsonTransformer jsonParser = new JsonTransformer(path,..); * ParseOutput parseOutput = jsonParser.parse("http://url",json,1L); * parseOutput.getExtractedContent(); // Returns the string "{ element: "element" }" * <p/> * ===== Example 3 ===== * <p/> * ArrayList<String> path = new ArrayList<String>(); * path.add("list"); * <p/> * JsonTransformer jsonParser = new JsonTransformer(path,..); * ParseOutput parseOutput = jsonParser.parse("http://url",json,1L); * parseOutput.getExtractedContent(); // Returns the string "{"list":[{ element: "element" }]}" * <p/> * The second parameter in the constructor do the exact same but is intended to return urls to the crawler * */ public class JsonTransformer extends JsonSupport implements ResponseParser { private final Map<String, List<String>> fieldPathsToRetrieve; public JsonTransformer(final Map<String, List<String>> fieldsPathsToRetrieve, final List<String> fieldToFiendNextUrl) { super(fieldToFiendNextUrl); if (fieldsPathsToRetrieve.size() < 1) { throw new IllegalArgumentException("You need a json path longer than 1 hop"); } this.fieldPathsToRetrieve = fieldsPathsToRetrieve; } @Override protected Map<String, Object> transformContent(final Map<String, Object> parse, final String url, final Long timeStamp) { return fieldPathsToRetrieve.entrySet().stream().collect(Collectors.toMap(Map.Entry::getKey, entry -> extractContent(parse, 0, url, entry.getValue(), true))); } }