/**
* Wordpress Crawler
* Copyright 08.06.2016 by Jigyasa Grover, @jig08
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package org.loklak.api.search;
import java.io.IOException;
import org.json.JSONArray;
import org.json.JSONObject;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.loklak.server.APIException;
import org.loklak.server.APIHandler;
import org.loklak.server.AbstractAPIHandler;
import org.loklak.server.Authorization;
import org.loklak.server.BaseUserRole;
import org.loklak.server.Query;
import org.loklak.susi.SusiThought;
import org.loklak.tools.storage.JSONObjectWithDefault;
import javax.servlet.http.HttpServletResponse;
public class WordpressCrawlerService extends AbstractAPIHandler implements APIHandler {
private static final long serialVersionUID = -5357182691897402354L;
@Override
public String getAPIPath() {
return "/api/wordpresscrawler.json";
}
@Override
public BaseUserRole getMinimalBaseUserRole() {
return BaseUserRole.ANONYMOUS;
}
@Override
public JSONObject getDefaultPermissions(BaseUserRole baseUserRole) {
return null;
}
@Override
public JSONObject serviceImpl(Query call, HttpServletResponse response, Authorization rights, JSONObjectWithDefault permissions)
throws APIException {
String url = call.get("url", "");
return crawlWordpress(url);
}
public static SusiThought crawlWordpress(String blogURL) {
Document blogHTML = null;
Elements articles = null;
Elements articleList_title = null;
Elements articleList_content = null;
Elements articleList_dateTime = null;
Elements articleList_author = null;
String[][] blogPosts = new String[100][4];
// blogPosts[][0] = Blog Title
// blogPosts[][1] = Posted On
// blogPosts[][2] = Author
// blogPosts[][3] = Blog Content
Integer numberOfBlogs = 0;
Integer iterator = 0;
try {
blogHTML = Jsoup.connect(blogURL).get();
} catch (IOException e) {
e.printStackTrace();
}
articles = blogHTML.getElementsByTag("article");
iterator = 0;
for (Element article : articles) {
articleList_title = article.getElementsByClass("entry-title");
for (Element blogs : articleList_title) {
blogPosts[iterator][0] = blogs.text().toString();
}
articleList_dateTime = article.getElementsByClass("posted-on");
for (Element blogs : articleList_dateTime) {
blogPosts[iterator][1] = blogs.text().toString();
}
articleList_author = article.getElementsByClass("byline");
for (Element blogs : articleList_author) {
blogPosts[iterator][2] = blogs.text().toString();
}
articleList_content = article.getElementsByClass("entry-content");
for (Element blogs : articleList_content) {
blogPosts[iterator][3] = blogs.text().toString();
}
iterator++;
}
numberOfBlogs = iterator;
JSONArray blog = new JSONArray();
for (int k = 0; k < numberOfBlogs; k++) {
JSONObject blogpost = new JSONObject();
blogpost.put("blog_url", blogURL);
blogpost.put("title", blogPosts[k][0]);
blogpost.put("posted_on", blogPosts[k][1]);
blogpost.put("author", blogPosts[k][2]);
blogpost.put("content", blogPosts[k][3]);
blog.put(blogpost);
}
SusiThought json = new SusiThought();
json.setData(blog);
return json;
}
}