Programmer: Lifelong Learning: Extend Nutch2 to Get Outlinks from COOLjsTree Javascript File

We use Nutch2 to crawl a documentation site and store the index in Solr 4.x to implement a documentation search function.

But I ran into one problem: the documentation site uses COOLjsTree, and its HTML pages define the left-side menu in tree_nodes.js.

END_USER: {
  NODES: [
   ["End User 1", "../../products/end_user1.htm", "_top"],
   ["End User 2", "../../products/end_user2.htm", "_top"],
  ],
  TITLE: " End-User"
}

Nutch2 provides parse-js plugin to find outlinks defined in javascript file or embedded javascript section.
It uses the following regular expression to find outlinks:

org.apache.nutch.parse.js.JSParseFilter
  private static final String STRING_PATTERN = "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)";
  private static final String URI_PATTERN = "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)";

It can find links like: http://site.com/folder/pagea.html. But it doesn't work for the links we defined in our tree_nodes.js.

But luckily, we can easily write our own Nutch plugin to modify or extend nutch.

We can create our own parse-tree-nodes-js plugin, write our own ParseFilter and Parser to parse outlinks from our tree_nodes.js file.
Implementation Code
First, check whether the page is a JavaScript file whose URL ends with tree_nodes.js; if so, extract the links from the file using a regular expression like the one below, which matches quoted strings of the form "*.htm|html|pdf":

// Matches a double-quoted string that ends in .htm, .html or .pdf; group 1 is the
// link without the surrounding quotes. The extension list is an alternation group
// (not a character class) and the dot is escaped, so strings such as "_top" are
// not mistaken for links.
private static final String URL_PATTERN_IN_TREE_NODE_JS = "\"([^\"]*\\.(?:html?|pdf))\"";

package org.jefferyyuan.codeexample.nutch.parse.js.treenodes;

/**
 * Nutch 2 parse plugin that extracts outlinks from a COOLjsTree
 * {@code tree_nodes.js} menu definition.
 *
 * <p>The class works both as a {@link ParseFilter}, scanning script elements and
 * JavaScript-bearing attributes of an already parsed DOM, and as a
 * {@link Parser}, reading the raw JavaScript content of the page. In both roles
 * only pages whose base URL ends with {@code tree_nodes.js} are handled.
 *
 * <p>NOTE(review): this listing references {@code LOG} and {@code getConf()},
 * which are not declared in the code shown here; they have to be supplied by
 * the rest of the plugin.
 */
public class TreeNodesJSParseFilter implements ParseFilter, Parser {
  /** Maximum number of characters taken from the script's first line as title. */
  private static final int MAX_TITLE_LEN = 80;
  /** Upper bound, in milliseconds, on the time spent matching links in one script. */
  private static final long MAX_MATCH_TIME_MS = 60000L;
  /** URL suffix identifying the files this plugin is responsible for. */
  private static final String TREE_NODES_JS_SUFFIX = "tree_nodes.js";

  /**
   * Matches links that must not be resolved against the base URL: anything
   * starting with {@code http://}, {@code https://} or {@code www.}. The
   * alternatives are written as a group; a character class such as
   * {@code [http|https|www]} would match any single one of those letters and
   * misclassify relative paths like {@code topics/a.htm} as absolute.
   */
  private static final String ABSOLUTE_URL_PATTERN_STR = "^(?:https?://|www\\.).*";
  /**
   * Matches a double-quoted string ending in {@code .htm}, {@code .html} or
   * {@code .pdf}; group 1 is the link without the quotes. The dot is escaped
   * and the extensions are an alternation so that menu targets such as
   * {@code "_top"} are not extracted as links.
   */
  private static final String CV_TREE_NODE_LINK_PATTERN_STR =
      "\"([^\"]*\\.(?:html?|pdf))\"";

  private static final Pattern ABSOLUTE_URL_PATTERN = compilePattern(
      ABSOLUTE_URL_PATTERN_STR, Perl5Compiler.CASE_INSENSITIVE_MASK
          | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.SINGLELINE_MASK);
  private static final Pattern CV_TREE_NODE_LINK_PATTERN = compilePattern(
      CV_TREE_NODE_LINK_PATTERN_STR, Perl5Compiler.CASE_INSENSITIVE_MASK
          | Perl5Compiler.READ_ONLY_MASK | Perl5Compiler.MULTILINE_MASK);

  /**
   * Compiles one of the built-in patterns.
   *
   * @throws IllegalStateException if the pattern is malformed; failing during
   *         class initialization is preferred over leaving a pattern
   *         {@code null} and hitting a NullPointerException on first use
   */
  private static Pattern compilePattern(String regex, int options) {
    try {
      return new Perl5Compiler().compile(regex, options);
    } catch (MalformedPatternException e) {
      throw new IllegalStateException("Invalid built-in pattern: " + regex, e);
    }
  }

  /**
   * Adds the links found in scripts of the parsed DOM to the existing parse.
   *
   * @return a new {@link Parse} carrying the extracted outlinks followed by the
   *         outlinks already present, or {@code parse} itself when the page is
   *         not handled or no link was found
   */
  @Override
  public Parse filter(String url, WebPage page, Parse parse,
      HTMLMetaTags metaTags, DocumentFragment doc) {
    if (!shouldHandle(page)) {
      return parse;
    }
    List<Outlink> outlinks = new ArrayList<Outlink>();
    walk(doc, parse, metaTags, url, outlinks);
    if (outlinks.isEmpty()) {
      return parse;
    }
    Outlink[] existing = parse.getOutlinks();
    if (existing != null) {
      outlinks.addAll(Arrays.asList(existing));
    }
    return new Parse(parse.getText(), parse.getTitle(),
        outlinks.toArray(new Outlink[outlinks.size()]), parse.getParseStatus());
  }

  /**
   * Recursively visits the DOM below {@code n} and collects links from
   * {@code script} element bodies, from {@code on*} event attributes and from
   * {@code href} attributes containing {@code javascript:}.
   */
  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base,
      List<Outlink> outlinks) {
    if (n instanceof Element) {
      if (n.getNodeName().equalsIgnoreCase("script")) {
        NodeList scriptParts = n.getChildNodes();
        if (scriptParts.getLength() > 0) {
          StringBuilder script = new StringBuilder();
          for (int i = 0; i < scriptParts.getLength(); i++) {
            if (i > 0) {
              script.append('\n');
            }
            // Child nodes without a value would otherwise add the text "null".
            String part = scriptParts.item(i).getNodeValue();
            if (part != null) {
              script.append(part);
            }
          }
          outlinks.addAll(Arrays.asList(getJSLinks(script.toString(), "", base)));
          // no other children of interest here, go one level up.
          return;
        }
      } else {
        // process all HTML 4.0 events, if present...
        // Window: onload,onunload
        // Form: onchange,onsubmit,onreset,onselect,onblur,onfocus
        // Keyboard: onkeydown,onkeypress,onkeyup
        // Mouse: onclick,ondbclick,onmousedown,onmouseout,onmousover,onmouseup
        NamedNodeMap attrs = n.getAttributes();
        for (int i = 0, len = attrs.getLength(); i < len; i++) {
          Node attr = attrs.item(i);
          String attrName = attr.getNodeName();
          String attrValue = attr.getNodeValue();
          boolean carriesScript = attrName.startsWith("on")
              || (attrName.equalsIgnoreCase("href") && attrValue != null
                  && attrValue.toLowerCase().contains("javascript:"));
          if (carriesScript) {
            outlinks.addAll(Arrays.asList(getJSLinks(attrValue, "", base)));
          }
        }
      }
    }
    NodeList children = n.getChildNodes();
    for (int i = 0; i < children.getLength(); i++) {
      walk(children.item(i), parse, metaTags, base, outlinks);
    }
  }

  /** Returns whether the page's base URL ends with {@code tree_nodes.js}. */
  private boolean shouldHandle(WebPage page) {
    String url = TableUtil.toString(page.getBaseUrl());
    return url != null && url.endsWith(TREE_NODES_JS_SUFFIX);
  }

  /**
   * Parses the raw content of a {@code tree_nodes.js} page.
   *
   * @return a successful {@link Parse} whose text is the script, whose title is
   *         the first line of the script (at most {@value #MAX_TITLE_LEN}
   *         characters) and whose outlinks are the extracted links; an empty
   *         parse with status {@code FAILED_INVALID_FORMAT} when the page is
   *         not handled by this plugin
   */
  @Override
  public Parse getParse(String url, WebPage page) {
    if (!shouldHandle(page)) {
      return ParseStatusUtils.getEmptyParse(
          ParseStatusCodes.FAILED_INVALID_FORMAT, "Content not JavaScript: '"
              + TableUtil.toString(page.getContentType()) + "'", getConf());
    }
    // NOTE(review): decoded with the platform default charset - confirm the
    // encoding the crawled site actually uses.
    String script = new String(page.getContent().array());
    Outlink[] outlinks = getJSLinks(script, "", url);

    // Title? use the first line of the script...
    int firstLineEnd = script.indexOf('\n');
    if (firstLineEnd == -1) {
      firstLineEnd = script.length();
    }
    String title = script.substring(0, Math.min(MAX_TITLE_LEN, firstLineEnd));

    return new Parse(script, title, outlinks, ParseStatusUtils.STATUS_SUCCESS);
  }

  /**
   * Extracts URLs from the quoted literals embedded in JavaScript.
   *
   * @param plainText the JavaScript text to scan
   * @param anchor anchor text assigned to every extracted outlink
   * @param base URL of the script; its last path segment (the file name) is
   *        dropped before relative links are resolved against it
   * @return the extracted outlinks, never {@code null}
   */
  private static Outlink[] getJSLinks(String plainText, String anchor,
      String base) {
    final long start = System.currentTimeMillis();
    final List<Outlink> outlinks = new ArrayList<Outlink>();

    // The base is expected to be an absolute URL such as
    // http://.../tree_nodes.js; remove the last file name. A base without any
    // '/' is used unchanged instead of failing on substring(0, -1).
    final int lastSlash = base.lastIndexOf('/');
    final String baseDir = lastSlash >= 0 ? base.substring(0, lastSlash) : base;

    // Only used for diagnostics; a broken base is reported once up front.
    URL baseURL = null;
    try {
      baseURL = new URL(baseDir);
    } catch (Exception e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("error assigning base URL", e);
      }
    }

    try {
      final PatternMatcher matcher = new Perl5Matcher();
      final PatternMatcherInput input = new PatternMatcherInput(plainText);

      // loop the matches
      while (matcher.contains(input, CV_TREE_NODE_LINK_PATTERN)) {
        // if this is taking too long, stop matching
        // (SHOULD really check cpu time used so that heavily loaded systems
        // do not unnecessarily hit this limit.)
        if (System.currentTimeMillis() - start >= MAX_MATCH_TIME_MS) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Time limit exceeded for getJSLinks");
          }
          break;
        }
        final MatchResult result = matcher.getMatch();
        final String candidate = result.group(1);
        // See if candidate URL is parseable. If not, pass and move on to
        // the next match.
        final String url;
        try {
          url = new URL(toAbsolutePath(baseDir, candidate)).toString();
        } catch (MalformedURLException ex) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Extension - failed URL parse '" + candidate
                + "' and baseURL '" + baseURL + "'", ex);
          }
          continue;
        }
        LOG.info("Extension added: " + url + " and baseURL " + baseURL);
        try {
          outlinks.add(new Outlink(url, anchor));
        } catch (MalformedURLException mue) {
          LOG.warn("Extension Invalid url: '" + url + "', skipping.");
        }
      }
    } catch (Exception ex) {
      // Link extraction is best effort: keep whatever was collected so far.
      if (LOG.isErrorEnabled()) {
        LOG.error("getJSLinks", ex);
      }
    }

    return outlinks.toArray(new Outlink[outlinks.size()]);
  }

  /**
   * Resolves a link from the tree definition against the script's directory.
   *
   * <p>Links starting with {@code http://}, {@code https://} or {@code www.}
   * are returned unchanged. For any other link each leading {@code ../} removes
   * one trailing path segment from {@code baseUrl}; segments are never removed
   * past the {@code scheme://host} part, so surplus {@code ../} prefixes are
   * simply dropped.
   *
   * @param baseUrl directory URL without trailing slash
   * @param path link as written in the script
   * @return the link unchanged if absolute, otherwise {@code baseUrl} (minus
   *         the climbed segments) + "/" + remaining path
   */
  private static String toAbsolutePath(String baseUrl, String path) {
    final PatternMatcher matcher = new Perl5Matcher();
    if (matcher.matches(new PatternMatcherInput(path), ABSOLUTE_URL_PATTERN)) {
      return path;
    }

    // First index that may be cut at: just behind "scheme://" when present.
    final int schemeEnd = baseUrl.indexOf("://");
    final int minCut = schemeEnd < 0 ? 0 : schemeEnd + 3;

    while (path.startsWith("../")) {
      final int slash = baseUrl.lastIndexOf('/');
      if (slash >= minCut) {
        baseUrl = baseUrl.substring(0, slash);
      }
      path = path.substring(3);
    }
    // now path is of the form foldera/fileb, without a leading "../"
    return baseUrl + "/" + path;
  }
}

Configuration
Then we need to include parse-tree-nodes-js in nutch-site.xml


<property>
  <name>plugin.includes</name>
  <value>protocol-http|urlfilter-regex|parse-tree-nodes-js|parse-(html|tika|metatags)|index-(basic|static|metadata|anchor)|urlnormalizer-(pass|regex|basic)|scoring-opic|subcollection</value>
</property>

Then change parse-plugins.xml to make nutch use parse-tree-nodes-js plugin to parse javascript file.

Then we need to change regex-urlfilter.txt so that Nutch handles JavaScript files: remove |js|JS from the following rule.
#-\.(gif|GIF|jpg|JPG|png|PNG|ico|ICO|css|CSS|sit|SIT|eps|EPS|wmf|WMF|zip|ZIP|ppt|PPT|mpg|MPG|xls|XLS|gz|GZ|rpm|RPM|tgz|TGZ|mov|MOV|exe|EXE|jpeg|JPEG|bmp|BMP|js|JS)$
Finally, since we don't need to store the content of JavaScript files in Solr, we can either write a Solr UpdateRequestProcessor that ignores a document when the value of its url field ends with .js, or change org.apache.nutch.indexer.solr.SolrWriter.write(NutchDocument) as shown below:

// Excerpt of the modified write method: only the added guard is shown, the
// remainder of the method body is elided as "...".
public void write(NutchDocument doc) throws IOException {
    String urlValue = doc.getFieldValue("url");
    // Return early for documents whose url field ends with ".js", so that none
    // of the elided code below runs for them.
    if(urlValue!=null && urlValue.endsWith(".js"))
    {
      LOG.trace("Extension ignore js file: " + urlValue);
      return;
    }
...
}

References
http://wiki.apache.org/nutch/AboutPlugins
http://wiki.apache.org/nutch/WritingPluginExample
http://florianhartl.com/nutch-plugin-tutorial.html

Extend Nutch2 to Get Outlinks from COOLjsTree Javascript File

Labels