Programmer: Lifelong Learning: Running Stanford Named Entity Recognition in UIMA

The Goal

To improve our text analytics project, after integrating OpenNLP with UIMA, we are now integrating Stanford NLP NER (Named Entity Recognition) into UIMA.

StanfordNLPAnnotator

Feature Structure: org.apache.uima.stanfordnlp.input:action

We use StanfordNLPAnnotator as the gateway (facade): the client sets org.apache.uima.stanfordnlp.input:action to specify what to extract, e.g. action=ner to run named entity extraction or action=sentiment to run sentiment analysis.

We use a dynamic output type, org.apache.uima.standfordnlp.output (note the "standfordnlp" spelling used in the code and the descriptor); its type feature specifies whether the entity is a person, an organization, etc.

The configuration parameter ClassifierFile specifies the model file that NER uses.

package org.lifelongprogrammer.nlp;
public class StanfordNLPAnnotator extends JCasAnnotator_ImplBase {
 public static final String STANFORDNLP_ACTION_NER = "ner";
 public static final String TYPE_STANDFORDNLP_OUTPUT = "org.apache.uima.standfordnlp.output";
 public static final String FS_STANDFORDNLP_OUTPUT_TYPE = TYPE_STANDFORDNLP_OUTPUT
   + ":type";
 public static final String TYPE_STANFORDNLP_INPUT = "org.apache.uima.stanfordnlp.input";
 public static final String FS_STANFORDNLP_INPUT_ACTION = TYPE_STANFORDNLP_INPUT
   + ":action";

 // http://nlp.stanford.edu/software/CRF-NER.shtml
 private static final Set<String> NER_TYPES = new HashSet<String>(
   Arrays.asList("PERSON", "ORGANIZATION", "LOCATION", "MISC", "TIME",
     "MONEY", "PERCENT", "DATE"));
          
 private static Splitter splitter = Splitter.on(",").trimResults()
   .omitEmptyStrings();
 public static final String CLASSIFIER_FILE_PARAM = "ClassifierFile";
 private CRFClassifier<CoreLabel> crf;
 private ExecutorService threadpool;
 private Logger logger;

 public void initialize(UimaContext aContext)
   throws ResourceInitializationException {
  super.initialize(aContext);
  this.logger = getContext().getLogger();
  reconfigure();
 }
 public void reconfigure() throws ResourceInitializationException {
  try {
   threadpool = Executors.newCachedThreadPool();
   String dataPath = getContext().getDataPath();

   String classifierFile = (String) getContext()
     .getConfigParameterValue(CLASSIFIER_FILE_PARAM);
   System.out.println(classifierFile);
   crf = CRFClassifier
     .getClassifier(new File(dataPath, classifierFile));
  } catch (Exception e) {
   logger.log(Level.SEVERE, e.getMessage());
   throw new ResourceInitializationException(e);
  }
 }
  
 public void process(JCas jcas) throws AnalysisEngineProcessException {
  CAS cas = jcas.getCas();
  ArrayList<String> action = getAction(cas);
  List<Future<Void>> futures = new ArrayList<Future<Void>>();
  if (action.contains(STANFORDNLP_ACTION_NER)) {
   Future<Void> future = threadpool.submit(new Callable<Void>() {
    @Override
    public Void call() throws Exception {
     getNer(jcas);
     return null;
    }
   });

   futures.add(future);
  }
    //...
  for (Future<Void> future : futures) {
   try {
    future.get();
   } catch (InterruptedException | ExecutionException e) {
    throw new AnalysisEngineProcessException(e);
   }
  }
  logger.log(Level.FINE, "StanfordNERAnnotator done.");
 }
  
 private ArrayList<String> getAction(CAS cas) {
  TypeSystem ts = cas.getTypeSystem();
  Type dyInputType = ts.getType(TYPE_STANFORDNLP_INPUT);
  org.apache.uima.cas.Feature dyInputTypesFt = ts
    .getFeatureByFullName(FS_STANFORDNLP_INPUT_ACTION);

  FSIterator<?> dyIt = cas.getAnnotationIndex(dyInputType).iterator();
  String action = "";
  while (dyIt.hasNext()) {
   // TODO this is kind of weird
   AnnotationFS afs = (AnnotationFS) dyIt.next();
   String str = afs.getStringValue(dyInputTypesFt);
   if (str != null) {
    action = str;
   }
  }
  return Lists.newArrayList(splitter.split(action));
 }
  
 private void getNer(JCas jcas) {
    CAS cas=jcas.getCas();
  String docText = jcas.getDocumentText();
  List<List<CoreLabel>> classify = crf.classify(docText);

  MatchedNER preNER = null;

  TypeSystem ts = jcas.getTypeSystem();
  Type dyOutputType = ts.getType(TYPE_STANDFORDNLP_OUTPUT);
  org.apache.uima.cas.Feature dyOutputTypeFt = ts
    .getFeatureByFullName(FS_STANDFORDNLP_OUTPUT_TYPE);

  // merge co-located same entity
  for (List<CoreLabel> coreLabels : classify) {
   for (CoreLabel coreLabel : coreLabels) {
    String category = coreLabel
      .get(CoreAnnotations.AnswerAnnotation.class);
    if (NER_TYPES.contains(category)) {
     if (preNER == null) {
      preNER = new MatchedNER(category,
        coreLabel.beginPosition(),
        coreLabel.endPosition());
     } else if (category.equals(preNER.getCategory())) {
      preNER = new MatchedNER(category,
        preNER.getEntityBegin(),
        coreLabel.endPosition());
     } else {
      // add preNER
      addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
      preNER = new MatchedNER(category,
        coreLabel.beginPosition(),
        coreLabel.endPosition());
     }
    } else {
     if (preNER != null) {
      addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
      preNER = null;
     }

    }
   }
  }
  if (preNER != null) {
   addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
  }
 }
 private void addNER(MatchedNER preNER, CAS cas, Type dyOutputType,
   org.apache.uima.cas.Feature dyOutputTypeFt) {
  AnnotationFS dyAnnFS = cas.createAnnotation(dyOutputType,
    preNER.getEntityBegin(), preNER.getEntityEnd());
  dyAnnFS.setStringValue(dyOutputTypeFt, preNER.getCategory()
    .toLowerCase());
  cas.getIndexRepository().addFS(dyAnnFS);
 }

 class MatchedNER {
  private String cat;
  private int entityBegin, entityEnd;

  public MatchedNER(String cat, int entityBegin, int entityEnd) {
   this.cat = cat;
   this.entityBegin = entityBegin;
   this.entityEnd = entityEnd;
  }
 }
}

Descriptor File: StanfordNLPAnnotator.xml

We define the UIMA types org.apache.uima.stanfordnlp.input and org.apache.uima.standfordnlp.output (the output type is spelled "standfordnlp" in both the descriptor and the code), and the configuration parameter ClassifierFile.

<!-- Analysis engine descriptor for StanfordNLPAnnotator: declares the
     ClassifierFile parameter and the input/output annotation types. -->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
 <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
 <primitive>true</primitive>
 <!-- Kept on one line so the class name carries no embedded whitespace. -->
 <annotatorImplementationName>org.lifelongprogrammer.nlp.StanfordNLPAnnotator</annotatorImplementationName>
 <analysisEngineMetaData>
  <name>StanfordNLPAnnotatorAE</name>
  <description>StanfordNLPAnnotator Wrapper.</description>
  <version>1.0</version>
  <vendor>LifeLong Programmer, Inc.</vendor>
  <configurationParameters>
   <configurationParameter>
    <name>ClassifierFile</name>
    <description>Filename of the classifier file.</description>
    <type>String</type>
    <multiValued>false</multiValued>
    <mandatory>true</mandatory>
   </configurationParameter>
  </configurationParameters>
  <configurationParameterSettings>
   <nameValuePair>
    <name>ClassifierFile</name>
    <value>
     <!-- relative to pear resource file -->
     <!-- Forward slashes resolve on both Windows and Unix; the value is kept
          on one line so it carries no leading/trailing whitespace. -->
     <string>models/classifiers/english.muc.7class.distsim.crf.ser.gz</string>
    </value>
   </nameValuePair>
  </configurationParameterSettings>
  <typeSystemDescription>
   <!-- Input: carries the comma-separated list of requested actions. -->
   <typeDescription>
    <name>org.apache.uima.stanfordnlp.input</name>
    <description />
    <supertypeName>uima.tcas.Annotation</supertypeName>
    <features>
     <featureDescription>
      <name>action</name>
      <description />
      <rangeTypeName>uima.cas.String</rangeTypeName>
     </featureDescription>
    </features>
   </typeDescription>

   <!-- Output: one annotation per entity. The name is spelled "standfordnlp"
        (sic) and must stay identical to the annotator's
        TYPE_STANDFORDNLP_OUTPUT constant. -->
   <typeDescription>
    <name>org.apache.uima.standfordnlp.output</name>
    <description />
    <supertypeName>uima.tcas.Annotation</supertypeName>
    <features>
     <featureDescription>
      <name>type</name>
      <description />
      <rangeTypeName>uima.cas.String</rangeTypeName>
     </featureDescription>
    </features>
   </typeDescription>
  </typeSystemDescription>
 </analysisEngineMetaData>
</analysisEngineDescription>

Annotator Test case

Here we use sujitpal's UimaUtils.java: the test adds the feature org.apache.uima.stanfordnlp.input:action=ner to the CAS, sends the CAS to the UIMA analysis engine, and then checks the org.apache.uima.standfordnlp.output annotations in the result.

private static final Joiner joiner = Joiner.on(",");
@Test
public void testStanfordNLPAnnotator() throws Exception {
  AnalysisEngine ae = UimaUtils.getAE("%ABS_PATH%\StanfordNLPAnnotator.xml", null);
  for (String input : INPUTS) {
    JCas jcas = ae.newJCas();
    addFSAction(jcas,Lists.newArrayList(StanfordNLPAnnotator.STANFORDNLP_ACTION_NER));
    jcas = UimaUtils.runAE(ae, input, UimaUtils.MIMETYPE_TEXT, jcas);

    Feature feature = jcas.getTypeSystem().getFeatureByFullName(
        "org.apache.uima.standfordnlp.output:type");
    org.apache.uima.cas.TypeSystem ts = jcas.getTypeSystem();
    org.apache.uima.cas.Type dyOutputType = ts
        .getType("org.apache.uima.standfordnlp.output");

    FSIndex<? extends Annotation> index = jcas
        .getAnnotationIndex(dyOutputType);
    for (Iterator<? extends Annotation> it = index.iterator(); it
        .hasNext();) {
      Annotation annotation = it.next();
      System.out.println("...(" + annotation.getBegin() + ","
          + annotation.getEnd() + "): "
          + annotation.getCoveredText() + ", type: "
          + annotation.getFeatureValueAsString(feature));
    }
  }
  ae.destroy();
}
/**
 * Stores the requested actions on the CAS: creates an input feature structure,
 * sets its action feature to the comma-joined action names, and adds it to the
 * indexes.
 *
 * @param jcas the CAS that receives the input feature structure
 * @param action the action names to request
 */
private void addFSAction(JCas jcas, List<String> action) {
  TypeSystem typeSystem = jcas.getTypeSystem();
  Type inputType = typeSystem
      .getType(StanfordNLPAnnotator.TYPE_STANFORDNLP_INPUT);
  Feature actionFeature = typeSystem
      .getFeatureByFullName(StanfordNLPAnnotator.FS_STANFORDNLP_INPUT_ACTION);

  FeatureStructure request = jcas.getCas().createFS(inputType);
  String joinedActions = joiner.join(action);
  request.setStringValue(actionFeature, joinedActions);
  jcas.addFsToIndexes(request);
}