Programmer: Lifelong Learning: Using Nutch to Extract Anchor Tag and Content

This series talks about how to use Nutch and Solr to implement Google Search's "Jump to" and Anchor links features.
The Problem
In the search results, to help users easily jump to the section they may be interested in, we want to add anchor links below the page description, just like Google Search's "Jump to" and anchor links features.
Main Steps
1. Extract anchor tag, text and content in Nutch.
This is described in this article, in "Using HTML Parser Jsoup and Regular Expression to Get Text between Two Tags", and in "Debugging and Optimizing Regular Expression".
2. Save anchor information to Solr.
3. Return Anchor tag and text that matches the query.

Task: Extract anchor tag, text and content in Nutch
We will write a Nutch plugin named index-anchor-content: it implements IndexingFilter extension point.

Its getFields returns a collection that contains WebPage.Field.CONTENT field. This will tell Nutch to read Content field from the underlying data store. Without this step, the WebPage instance in filter(NutchDocument, String, WebPage) method would not have value for content field.

In filter method, we use jsoup to extract all anchor links in div[id=toc] ul>li section.

Then we use the regular expression <span[^>]*\bid\s*=\s*(?:"|')?{0}(?:'|")?[^>]*>([^<]*)</span>(.*?)<span[^>]*\bid\s*=\s*(?:"|')?{1}(?:'|")?[^>]*>[^<]*</span> to extract the tag, text and content for each anchor. {0} and {1} are placeholders for the anchor tags of two consecutive anchors, anchor1 and anchor2.

We then add them into NutchDocument fields: anchorTags, anchorTexts, anchorContents.

Please read more from Using HTML Parser Jsoup and Regex to Extract Text between Tow Tags
Debugging and Optimizing Regular Expression

The detailed steps to build a Nutch plugin are omitted. Please refer to Writing Nutch Plugin Example.
Code

/**
 * Indexing filter that looks for in-page anchor links (by default the links
 * inside a table of contents), cuts the page into the sections that lie
 * between consecutive anchors, and adds the anchor tag, anchor text and
 * section text of every section to the {@link NutchDocument} as three
 * multi-valued fields.
 * <p>
 * All selectors, field names and the extraction regex can be overridden via
 * the {@code indexer.anchorContent.*} configuration properties read in
 * {@link #setConf(Configuration)}.
 */
public class AnchorContentIndexingFilter implements IndexingFilter {

  public static final Logger LOG = LoggerFactory
      .getLogger(AnchorContentIndexingFilter.class);

  /** Fields that must be loaded from the data store before filter() runs. */
  private static final Collection<WebPage.Field> FIELDS = new HashSet<WebPage.Field>();
  static {
    FIELDS.add(WebPage.Field.CONTENT);
  }

  private static final String DEFAULT_REGEX_TOC_ANCHOR = "div[id=toc] ul>li a[href^=#]:not([href=#])";
  private static final String DEFAULT_REGEX_PLAIN_ANCHOR_TAG = "a[href^=#]:not([href=#])";

  private static final int DEFAULT_MAX_ANCHOR_LINKS = 20;
  private static final String DEFAULT_FL_ANCHOR_TAGS = "anchorTags";
  private static final String DEFAULT_FL_ANCHOR_TEXTS = "anchorTexts";
  private static final String DEFAULT_FL_ANCHOR_CONTENTS = "anchorContents";
  private static final String DEFAULT_REGEX_BODY_ROOT = "article[id=sectionContent]";

  /**
   * Default MessageFormat pattern for the extraction regex. Two details matter:
   * the word boundary is written as "\\b" (a bare "\b" in a Java string literal
   * is the backspace character), and every literal single quote is doubled,
   * because MessageFormat treats a lone single quote as the start of a quoted
   * section in which the {0}/{1} placeholders would NOT be substituted.
   */
  private static final String DEFAULT_REGEX_EXTRACT_CONTENT = "<span[^>]*?\\bid\\s*=\\s*(?:\"|'')?{0}(?:''|\")?[^>]*>([^<]*)</span>(.*?)<span[^>]*?\\bid\\s*=\\s*(?:\"|'')?{1}(?:''|\")?[^>]*>([^<]*)</span>";

  private Configuration conf;

  private String flAnchorTags, flAnchorTexts, flAnchorContents, regexTocAnchor,
      // if can't find tocAnchor in web page, revert to plainAnchorTag
      regexPlainAnchorTag,
      // if exists, only search content in this section
      regexBodyRoot;

  private boolean extractOtherAnchors = false;

  /**
   * The regex used to extract the content between two tags: <br>
   * 1. The string must have 2 place holders {0}, {1}; they are replaced by the
   * (regex-quoted) anchor names at runtime.<br>
   * 2. There must be 3 regex groups: the first group extracts the text of the
   * first anchor, the second group extracts the content between the two
   * anchors, the third extracts the text of the second anchor.<br>
   * 3. If there is a single quote ' in the regex string, it has to be replaced
   * by doubled single quotes '' because the string is processed by
   * MessageFormat. See
   * http://docs.oracle.com/javase/7/docs/api/java/text/MessageFormat.html <br>
   * Check DEFAULT_REGEX_EXTRACT_CONTENT for an example.
   */
  private String regexExtractContent = DEFAULT_REGEX_EXTRACT_CONTENT;

  private int maxAnchorLinks = DEFAULT_MAX_ANCHOR_LINKS;

  /** Compiled form of regexExtractContent; (re)built in setConf. */
  private MessageFormat msgFormat = new MessageFormat(DEFAULT_REGEX_EXTRACT_CONTENT);

  /**
   * Adds one value per extracted anchor section to the anchor-tag, anchor-text
   * and anchor-content fields of {@code doc}. Sections for which any of the
   * three values is blank are skipped, so the three fields always stay
   * aligned by index.
   *
   * @param doc  the document being indexed; returned after modification
   * @param url  the page URL (not used by this filter)
   * @param page the stored page; its content field is parsed as HTML
   * @return {@code doc}, unchanged when the page has no content
   * @throws IndexingException if anchor extraction fails with an I/O error
   */
  @Override
  public NutchDocument filter(NutchDocument doc, String url, WebPage page)
      throws IndexingException {

    ByteBuffer dataBuffer = page.getContent();
    if (dataBuffer == null) {
      // nothing was fetched/stored for this page
      return doc;
    }
    // Read only the readable window of the buffer; array() would also expose
    // bytes outside position..limit. duplicate() keeps the caller's buffer
    // position untouched.
    ByteBuffer view = dataBuffer.duplicate();
    byte[] raw = new byte[view.remaining()];
    view.get(raw);
    // NOTE(review): assumes the stored content is UTF-8; the previous code
    // used the platform default charset, which varies between machines.
    String content = new String(raw, java.nio.charset.Charset.forName("UTF-8"));

    Document rootDoc = Jsoup.parse(content);
    try {
      List<Anchor> anchors = parseAnchors(rootDoc);
      for (Anchor anchor : anchors) {
        if (anchor != null && StringUtils.isNotBlank(anchor.getTag())
            && StringUtils.isNotBlank(anchor.getText())
            && StringUtils.isNotBlank(anchor.getContent())) {
          doc.add(flAnchorTags, anchor.getTag());
          doc.add(flAnchorTexts, anchor.getText());
          doc.add(flAnchorContents, anchor.getContent());
        }
      }
    } catch (IOException e) {
      throw new IndexingException(e);
    }
    return doc;
  }

  /**
   * Splits the document (or the configured body-root element, if any) into
   * one {@link Anchor} per anchor link found by {@link #getAnchors(Element)}.
   *
   * @param rootDoc the parsed page
   * @return the extracted anchors in document order; empty when the body root
   *         or the anchor links cannot be found. Never contains {@code null}.
   * @throws IOException declared for compatibility with existing callers
   */
  public List<Anchor> parseAnchors(Document rootDoc) throws IOException {
    List<Anchor> anchorContents = new LinkedList<Anchor>();
    Element rootElement = rootDoc;
    // regexBodyRoot may legitimately be configured as blank, meaning
    // "search the whole document"; an empty selector must not reach select().
    if (StringUtils.isNotBlank(regexBodyRoot)) {
      rootElement = rootDoc.select(regexBodyRoot).first();
    }
    if (rootElement == null) {
      return anchorContents;
    }
    Set<String> anchors = getAnchors(rootElement);
    if (anchors.isEmpty()) {
      return anchorContents;
    }
    StringBuilder remainingTxt = new StringBuilder(rootElement.toString());

    Iterator<String> it = anchors.iterator();
    String curAnchorTag = it.next();
    String lastAnchorText = null;
    while (it.hasNext() && remainingTxt.length() > 0) {
      String nextAnchorTag = it.next();
      Anchor anchor = getContentBetweenAnchor(remainingTxt, curAnchorTag,
          nextAnchorTag);
      // null means the regex did not match this pair of anchors; skip the
      // pair instead of storing null and failing later.
      if (anchor != null) {
        anchorContents.add(anchor);
        if (!it.hasNext()) {
          // only for last anchor: its text was captured by the third group
          lastAnchorText = anchor.getNextTagText();
        }
      }
      curAnchorTag = nextAnchorTag;
    }
    // Don't forget the last anchor: everything that is left belongs to it.
    String lastTxt = Jsoup.parse(remainingTxt.toString()).text();
    if (StringUtils.isNotBlank(lastTxt)) {
      anchorContents.add(new Anchor(curAnchorTag, lastAnchorText, lastTxt));
    }
    return anchorContents;
  }

  /**
   * Collects the anchor names (href without the leading '#') below
   * {@code rootElement}, in document order and without duplicates. The
   * table-of-contents selector is tried first; the plain-anchor selector is
   * used as a fallback only when enabled and the first one found nothing.
   *
   * @param rootElement element to search in
   * @return at most {@code maxAnchorLinks} anchor names
   */
  public Set<String> getAnchors(Element rootElement) {
    // Insertion-ordered set that silently refuses elements past the limit.
    Set<String> anchors = new LinkedHashSet<String>() {
      private static final long serialVersionUID = 1L;

      @Override
      public boolean add(String e) {
        if (size() >= maxAnchorLinks) {
          return false;
        }
        return super.add(e);
      }
    };
    getAnchorsImpl(rootElement, regexTocAnchor, anchors);
    if (anchors.isEmpty() && extractOtherAnchors) {
      getAnchorsImpl(rootElement, regexPlainAnchorTag, anchors);
    }
    return anchors;
  }

  /**
   * Adds the name of every link matched by {@code anchorPattern} to
   * {@code anchors}.
   *
   * @param rootElement   element to search in
   * @param anchorPattern jsoup CSS selector for the anchor links
   * @param anchors       receives the href values with the first character
   *                      (expected to be '#') removed
   */
  public void getAnchorsImpl(Element rootElement, String anchorPattern,
      Set<String> anchors) {
    Elements elements = rootElement.select(anchorPattern);
    for (Element element : elements) {
      String href = element.attr("href");
      // guard against custom selectors that match links with an empty href
      if (href.length() > 1) {
        anchors.add(href.substring(1));
      }
    }
  }

  /**
   * Extracts the section between two anchors from the start of
   * {@code remainingTxt} and removes the consumed part from the buffer, so
   * that the buffer afterwards starts at the element of {@code nextAnchorTag}.
   *
   * @param remainingTxt  HTML not yet consumed; modified on a match
   * @param curAnchorTag  name of the anchor that opens the section
   * @param nextAnchorTag name of the anchor that closes the section
   * @return the extracted anchor, or {@code null} if the regex does not match
   * @throws IOException declared for compatibility with existing callers
   */
  public Anchor getContentBetweenAnchor(StringBuilder remainingTxt,
      String curAnchorTag, String nextAnchorTag) throws IOException {
    Anchor anchor = null;
    // Anchor names come from the page and may contain regex metacharacters;
    // quote them so they are always matched literally.
    String regex = msgFormat.format(new String[] {
        Pattern.quote(curAnchorTag), Pattern.quote(nextAnchorTag) });
    Matcher matcher = Pattern
        .compile(regex, Pattern.DOTALL | Pattern.MULTILINE).matcher(remainingTxt);
    if (matcher.find()) {
      String anchorText = Jsoup.parse(matcher.group(1)).text();
      String anchorContent = anchorText + " "
          + Jsoup.parse(matcher.group(2)).text();
      String nextTagText = matcher.group(3);
      anchor = new Anchor(curAnchorTag, anchorText, anchorContent, nextTagText);

      // Drop everything up to the end of the section body; the element of
      // the next anchor stays at the head of the buffer for the next round.
      int g2End = matcher.end(2);
      remainingTxt.delete(0, g2End);
    }
    return anchor;
  }

  @Override
  public Collection<WebPage.Field> getFields() {
    return FIELDS;
  }

  /** Immutable value holder for one extracted anchor section. */
  private static class Anchor {
    private final String tag, text, content,
    // used to get last tag text
    nextTagText;

    Anchor(String tag, String text, String content) {
      this(tag, text, content, null);
    }

    Anchor(String tag, String text, String content, String nextTagText) {
      this.tag = tag;
      this.text = text;
      this.content = content;
      this.nextTagText = nextTagText;
    }

    String getTag() {
      return tag;
    }

    String getText() {
      return text;
    }

    String getContent() {
      return content;
    }

    String getNextTagText() {
      return nextTagText;
    }
  }

  /** @return the configuration last passed to {@link #setConf(Configuration)} */
  public Configuration getConf() {
    return conf;
  }

  /**
   * Stores the configuration and reads all {@code indexer.anchorContent.*}
   * settings, falling back to the built-in defaults.
   *
   * @param conf the configuration to read from
   * @throws IllegalArgumentException if a mandatory setting is blank
   * @throws NumberFormatException    if the max-anchor-links setting is not
   *                                  an integer
   */
  public void setConf(Configuration conf) {
    this.conf = conf;

    flAnchorTags = getValue(conf, "indexer.anchorContent.field.anchorTags",
        DEFAULT_FL_ANCHOR_TAGS, false);
    // was reading the anchorTags key, which made the text field name
    // impossible to configure independently
    flAnchorTexts = getValue(conf, "indexer.anchorContent.field.anchorTexts",
        DEFAULT_FL_ANCHOR_TEXTS, false);
    flAnchorContents = getValue(conf,
        "indexer.anchorContent.field.anchorContents",
        DEFAULT_FL_ANCHOR_CONTENTS, false);
    regexTocAnchor = getValue(conf, "indexer.anchorContent.regex.tocAnchor",
        DEFAULT_REGEX_TOC_ANCHOR, false);
    String str = getValue(conf, "indexer.anchorContent.extractOtherAnchors",
        "false", true);
    if (StringUtils.isNotBlank(str)) {
      extractOtherAnchors = Boolean.parseBoolean(str);
    }
    if (extractOtherAnchors) {
      regexPlainAnchorTag = getValue(conf,
          "indexer.anchorContent.regex.plainAnchorTag",
          DEFAULT_REGEX_PLAIN_ANCHOR_TAG, false);
    }
    regexBodyRoot = getValue(conf, "indexer.anchorContent.regex.bodyRoot",
        DEFAULT_REGEX_BODY_ROOT, true);

    regexExtractContent = getValue(conf,
        "indexer.anchorContent.regex.extractContent",
        DEFAULT_REGEX_EXTRACT_CONTENT, false);
    msgFormat = new MessageFormat(regexExtractContent);

    str = conf.get("indexer.anchorContent.maxAnchorLinks");
    if (str != null) {
      maxAnchorLinks = Integer.parseInt(str.trim());
    }
  }

  /**
   * Looks up {@code param} in {@code conf}, falling back to {@code oldValue}
   * when the parameter is not set.
   *
   * @param conf      configuration to read from
   * @param param     name of the configuration parameter
   * @param oldValue  value to use when the parameter is absent
   * @param blankable whether a blank result is acceptable
   * @return the configured value or {@code oldValue}
   * @throws IllegalArgumentException if the result is blank and
   *                                  {@code blankable} is false
   */
  public String getValue(Configuration conf, String param, String oldValue,
      boolean blankable) {
    String configured = conf.get(param);
    String newValue = (configured != null) ? configured : oldValue;
    if (!blankable && StringUtils.isBlank(newValue)) {
      // report the parameter name; the value itself is blank by definition
      throw new IllegalArgumentException(param + " is set to empty or null.");
    }
    return newValue;
  }
}

Configuration
We update plugin.includes in nutch-site.xml to include this plugin. In solrindex-mapping.xml, we map the fields in the NutchDocument to the corresponding fields in the Solr document.

<field dest="anchorTags" source="anchorTags" />
<field dest="anchorTexts" source="anchorTexts" />
<field dest="anchorContents" source="anchorContents" />

Resource
Using HTML Parser Jsoup and Regex to Extract Text between Tow Tags
Debugging and Optimizing Regular Expression
Writing Nutch Plugin Example

Using Nutch to Extract Anchor Tag and Content

Labels