Programmer: Lifelong Learning: Using lucene-appengine & google-http-java-client to Crawl Blogger on GAE

The Goal
In my latest project, I need to develop a GAE Java application that crawls a Blogger site and saves the index into Lucene on GAE.

This post introduces how to deploy lucene-appengine and use google-http-java-client to parse sitemap.xml to get all posts, crawl each post, save the index to lucene-appengine on GAE, and then use a GAE cron task to index new posts periodically.

Creating Maven GAE project & Adding Dependencies
First, check GAE: Using Apache Maven to create a Maven project from appengine-skeleton-archetype.

Then download the lucene-appengine-examples source code, copy the needed dependencies from its pom.xml, and add google-http-client, google-http-client-appengine and google-http-client-xml to pom.xml.

Using google-http-java-client to Parse sitemap.xml
The google-http-java-client library allows us to easily convert an XML response into a Java object with com.google.api.client.http.HttpResponse.parseAs(SomeClass.class); all we need to do is define the Java class.

Check blogger's sitemap.xml: lifelongprogrammer sitemap.xml

<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
  <url>
    <loc>http://lifelongprogrammer.blogspot.com/2014/11/using-solr-classifier-to-categorize-articles.html</loc>
    <lastmod>2014-11-04T22:49:54Z</lastmod>
  </url>
</urlset>

So we can map it to two classes, Urlset and TUrl. The key here is to use @com.google.api.client.util.Key to map a Java field to an element in the XML.

/** Java counterpart of the root {@code <urlset>} element of sitemap.xml. */
public class Urlset {
 /** Entries bound from the {@code url} elements; starts out as an empty list. */
 @Key
 protected List<TUrl> url = new ArrayList<>();

 /** Returns the list held in the {@code url} field. */
 public List<TUrl> getUrl() {
  return this.url;
 }
}
/** Java counterpart of one {@code <url>} entry of sitemap.xml. */
public class TUrl {
 /** Bound from the {@code loc} element. */
 @Key protected String loc;

 /** Bound from the {@code lastmod} element, kept as a string. */
 @Key protected String lastmod;

  // omitted the getters
}

Then use the following code to parse sitemap.xml into a Urlset Java object.

static final HttpTransport HTTP_TRANSPORT = new NetHttpTransport();
static final XmlNamespaceDictionary XML_DICT = new XmlNamespaceDictionary();

// Attach an XML object parser to every request the factory builds, so that
// the response can later be converted with parseAs(...).
HttpRequestInitializer xmlParserInitializer = new HttpRequestInitializer() {
  @Override
  public void initialize(HttpRequest request) {
    request.setParser(new XmlObjectParser(XML_DICT));
  }
};
HttpRequestFactory requestFactory =
    HTTP_TRANSPORT.createRequestFactory(xmlParserInitializer);

// GET the sitemap and bind the XML body to Urlset.
GenericUrl sitemap = new GenericUrl(sitemapUrl);
HttpResponse response = requestFactory.buildGetRequest(sitemap).execute();
Urlset urls = response.parseAs(Urlset.class);

When parsing each post, we can use the following code to get the post's HTML string:

// No parser is needed here: the response body is read as a plain string.
GenericUrl postUrl = new GenericUrl(url.getLoc());
String html = HTTP_TRANSPORT.createRequestFactory()
    .buildGetRequest(postUrl)
    .execute()
    .parseAsString();

LAEUtil
The following is the complete code, which parses the sitemap, then crawls each post and saves the index into lucene-appengine.

/**
 * Crawls a Blogger site through its sitemap.xml and stores each post in a
 * Lucene index that lives in a {@link GaeDirectory}.
 */
public class LAEUtil {
 private static final Logger logger = LoggerFactory.getLogger(LAEUtil.class);
 private static final Version LUCENE_VERSION = Version.LUCENE_4_10_2;
 /** Number of indexed posts after which an intermediate commit is issued. */
 private static final int COMMIT_BATCH_SIZE = 20;

 static final HttpTransport HTTP_TRANSPORT = new NetHttpTransport();
 static final XmlNamespaceDictionary XML_DICT = new XmlNamespaceDictionary();

 /**
  * Crawls the posts listed in {@code sitemapUrl} that are not yet covered by
  * the index {@code indexName}, creating the index first when it is missing.
  *
  * <p>An {@link IOException} raised while crawling is logged and not
  * propagated, so callers cannot tell from the outcome whether the crawl
  * completed.
  *
  * @param indexName  name of the {@link GaeDirectory} holding the index
  * @param sitemapUrl URL of the sitemap.xml to read
  * @param maxSeconds time budget; no further post is started once this many
  *                   seconds have elapsed since the call began
  * @throws IOException declared for callers; I/O failures inside the crawl
  *                     are caught and logged here
  */
 public static void crawl(String indexName, String sitemapUrl,
   long maxSeconds) throws IOException {
  Stopwatch stopwatch = Stopwatch.createStarted();
  try (GaeDirectory directory = new GaeDirectory(indexName)) {
   Date crawledMinDate;
   Date crawledMaxDate;
   // try-with-resources closes the reader even when a search fails, and
   // always before the writer is opened on the same directory.
   try (IndexReader reader = openReader(directory)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    crawledMinDate = getCrawledMinMaxDate(searcher, false);
    crawledMaxDate = getCrawledMinMaxDate(searcher, true);
   }
   crawl(directory, stopwatch, sitemapUrl, crawledMinDate,
     crawledMaxDate, maxSeconds);
  } catch (IOException e) {
   logger.error("crawl failed with error", e);
  }
 }

 /** Opens a reader on {@code directory}, creating an empty index if none exists. */
 private static IndexReader openReader(GaeDirectory directory)
   throws IOException {
  try {
   return DirectoryReader.open(directory);
  } catch (IndexNotFoundException e) {
   createIndex(directory);
   return DirectoryReader.open(directory);
  }
 }

 /** Creates an empty index by opening an {@link IndexWriter} and closing it again. */
 private static void createIndex(GaeDirectory directory) throws IOException {
  try (IndexWriter writer = new IndexWriter(directory,
    getIndexWriterConfig(LUCENE_VERSION, getAnalyzer()))) {
   // Intentionally empty: opening and closing the writer is all that is needed.
  }
 }

 /**
  * Returns the smallest or largest {@code Fields.LASTMOD} value in the index.
  *
  * @param searcher searcher over the index
  * @param newest   {@code true} for the largest value (reverse sort),
  *                 {@code false} for the smallest
  * @return the date, or {@code null} when the index holds no document
  */
 private static Date getCrawledMinMaxDate(IndexSearcher searcher,
   boolean newest) throws IOException {
  Sort byLastmod = new Sort(new SortField(Fields.LASTMOD,
    SortField.Type.LONG, newest));
  TopFieldDocs docs = searcher.search(new MatchAllDocsQuery(), 1, byLastmod);

  ScoreDoc[] hits = docs.scoreDocs;
  if (hits.length == 0) {
   return null;
  }
  Document doc = searcher.doc(hits[0].doc);
  return new Date(Long.parseLong(doc.get(Fields.LASTMOD)));
 }

 /**
  * Indexes every sitemap entry whose lastmod lies outside
  * [crawledMinDate, crawledMaxDate]; posts inside that range are treated as
  * already crawled. Stops starting new posts once {@code maxSeconds} have
  * elapsed on {@code stopwatch}.
  */
 private static void crawl(GaeDirectory directory, Stopwatch stopwatch,
   String sitemapUrl, Date crawledMinDate, Date crawledMaxDate,
   long maxSeconds) throws IOException {
  Urlset urls = fetchSitemap(sitemapUrl);
  PorterAnalyzer analyzer = getAnalyzer();

  // posts are sorted by lastMod in sitemap.xml
  int pending = 0;
  try (IndexWriter writer = new IndexWriter(directory,
    getIndexWriterConfig(LUCENE_VERSION, analyzer))) {

   for (TUrl url : urls.getUrl()) {
    // will not happen
    Date lastmod = url.getLastmodDate();
    if (lastmod == null) {
     continue;
    }

    long elapsedSeconds = stopwatch.elapsed(TimeUnit.SECONDS);
    if (elapsedSeconds >= maxSeconds) {
     logger.error("Exceed timelimt " + maxSeconds
       + ", already run " + elapsedSeconds + " seconds");
     break;
    }

    if (!isUncrawled(lastmod, crawledMinDate, crawledMaxDate)) {
     if (logger.isDebugEnabled()) {
      logger.debug("ingore " + url + " : lastmod " + lastmod
        + ", crawlMaxDate: " + crawledMaxDate
        + ", crawledMinDate: " + crawledMinDate);
     }
     continue;
    }

    crawlPost(url, writer);
    // Commit in batches so that work done so far is persisted.
    if (++pending == COMMIT_BATCH_SIZE) {
     writer.commit();
     pending = 0;
    }
   }
   // NOTE(review): progress messages use error level; presumably so they pass
   // a restrictive log threshold - confirm before lowering.
   logger.error("started to commit");
   writer.commit();
   logger.error("commit finished.");
  }
 }

 /** Downloads {@code sitemapUrl} and binds its XML body to {@link Urlset}. */
 private static Urlset fetchSitemap(String sitemapUrl) throws IOException {
  HttpRequestFactory requestFactory = HTTP_TRANSPORT
    .createRequestFactory(new HttpRequestInitializer() {
     @Override
     public void initialize(HttpRequest request) {
      request.setParser(new XmlObjectParser(XML_DICT));
     }
    });
  HttpRequest request = requestFactory.buildGetRequest(new GenericUrl(
    sitemapUrl));
  return request.execute().parseAs(Urlset.class);
 }

 /**
  * Tells whether a post with the given lastmod still has to be indexed:
  * always when either bound is unknown, otherwise when lastmod is after
  * {@code crawledMaxDate} or before {@code crawledMinDate}.
  */
 private static boolean isUncrawled(Date lastmod, Date crawledMinDate,
   Date crawledMaxDate) {
  if (crawledMinDate == null || crawledMaxDate == null) {
   return true;
  }
  return lastmod.after(crawledMaxDate) || lastmod.before(crawledMinDate);
 }

 private static PorterAnalyzer getAnalyzer() {
  return new PorterAnalyzer(LUCENE_VERSION);
 }

 /**
  * Downloads one post and writes it to the index with its URL, raw HTML,
  * title, extracted main content and lastmod.
  *
  * @throws IOException      if the download or the index write fails
  * @throws RuntimeException wrapping a {@code BoilerpipeProcessingException}
  *                          raised while extracting the main content
  */
 private static void crawlPost(TUrl url, IndexWriter writer)
   throws IOException {
  logger.info(url.getLoc() + " : " + url.getLastmod());
  HttpRequestFactory requestFactory = HTTP_TRANSPORT
    .createRequestFactory();
  HttpRequest request = requestFactory.buildGetRequest(new GenericUrl(url
    .getLoc()));
  HttpResponse response = request.execute();

  String html = response.parseAsString();
  Document luceneDoc = new Document();
  luceneDoc.add(new StringField(Fields.ID, url.getLoc(), Store.YES));
  luceneDoc.add(new TextField(Fields.URL, url.getLoc(), Store.YES));

  luceneDoc.add(new TextField(Fields.RAWCONTENT, html, Store.YES));

  ArticleExtractor articleExtractor = ArticleExtractor.getInstance();

  org.jsoup.nodes.Document jsoupDoc = Jsoup.parse(html);
  luceneDoc.add(new TextField(Fields.TITLE, jsoupDoc.title(), Store.YES));

  html = normalize(jsoupDoc);
  try {
   String mainContent = articleExtractor.getText(html);
   luceneDoc.add(new TextField(Fields.MAINCONTENT, mainContent,
     Store.YES));
  } catch (BoilerpipeProcessingException e) {
   throw new RuntimeException(e);
  }
  luceneDoc.add(new LongField(Fields.LASTMOD, url.getLastmodDate()
    .getTime(), Store.YES));
  // A post modified after an earlier crawl is selected again; replace the
  // document stored under the same ID instead of adding a duplicate.
  writer.updateDocument(
    new org.apache.lucene.index.Term(Fields.ID, url.getLoc()), luceneDoc);
 }
}

BloggerCrawler Servlet
We can call the BloggerCrawler servlet manually to test our crawler. When we test or call the servlet manually, we set maxseconds to a smaller value because of the GAE request handler time limit; when we call it from a cron task, we set it to 8 minutes (the time limit for a task is 10 minutes).

public class BloggerCrawler extends HttpServlet {
 private static final Logger logger = LoggerFactory
   .getLogger(BloggerCrawler.class);
 protected void doGet(HttpServletRequest req, HttpServletResponse resp)
   throws ServletException, IOException {

  String site = Preconditions.checkNotNull(req.getParameter("sitename"),
    "site can't be null");

  String indexName = site;
  if (site.endsWith("blogspot.com")) {
   throw new IllegalArgumentException("not valid sitename: " + site);
  }
  String sitemapUrl = "http://" + site + ".blogspot.com/sitemap.xml";

  int maxseconds = getMaxSeconds(req);
  logger.info("started to crawl " + sitemapUrl);
  Util.crawl(indexName, sitemapUrl, maxseconds);
  super.doGet(req, resp);
 }
 private int getMaxSeconds(HttpServletRequest req) {
  int maxseconds = 40;
  String str = req.getParameter("maxseconds");
  if (str != null) {
   maxseconds = Integer.parseInt(str);
  }
  return maxseconds;
 }
}

Scheduled Crawler with GAE Cron
We can use GAE cron to call the crawler servlet periodically, for example every 12 hours. All we need to do is add the cron task to cron.xml:
Check Scheduled Tasks With Cron for Java for more about GAE cron.
Note that the local development server does not execute cron jobs and does not show the Cron Jobs link. The deployed App Engine application shows cron jobs and executes them.

<cronentries>
  <cron>
    <url>/crawl?sitename=lifelongprogrammer&amp;maxseconds=480</url>
    <description>Crawl lifelongprogrammer every 12 hours</description>
    <schedule>every 12 hours</schedule>
  </cron>
</cronentries>

References
lucene-appengine
GAE: Using Apache Maven
Scheduled Tasks With Cron for Java

Using lucene-appengine & google-http-java-client to Crawl Blogger on GAE

Labels