Running Stanford Named Entity Recognition in UIMA

The Goal
To improve our text analytics project, after integrating OpenNLP with UIMA, we are now integrating Stanford NLP NER (Named Entity Recognition) into UIMA.

StanfordNLPAnnotator
Feature Structure: org.apache.uima.stanfordnlp.input:action
We use StanfordNLPAnnotator as the gateway (facade): the client sets org.apache.uima.stanfordnlp.input:action to specify what to extract. action=ner runs named entity extraction, and action=sentiment runs sentiment analysis.

We use a dynamic output annotation, org.apache.uima.stanfordnlp.output; its type feature specifies whether the entity is a person, an organization, and so on.

The configuration parameter ClassifierFile specifies the model file that NER uses.

package org.lifelongprogrammer.nlp;
public class StanfordNLPAnnotator extends JCasAnnotator_ImplBase {
 public static final String STANFORDNLP_ACTION_NER = "ner";
 public static final String TYPE_STANDFORDNLP_OUTPUT = "org.apache.uima.standfordnlp.output";
 public static final String FS_STANDFORDNLP_OUTPUT_TYPE = TYPE_STANDFORDNLP_OUTPUT
   + ":type";
 public static final String TYPE_STANFORDNLP_INPUT = "org.apache.uima.stanfordnlp.input";
 public static final String FS_STANFORDNLP_INPUT_ACTION = TYPE_STANFORDNLP_INPUT
   + ":action";

 // http://nlp.stanford.edu/software/CRF-NER.shtml
 private static final Set<String> NER_TYPES = new HashSet<String>(
   Arrays.asList("PERSON", "ORGANIZATION", "LOCATION", "MISC", "TIME",
     "MONEY", "PERCENT", "DATE"));
          
 private static Splitter splitter = Splitter.on(",").trimResults()
   .omitEmptyStrings();
 public static final String CLASSIFIER_FILE_PARAM = "ClassifierFile";
 private CRFClassifier<CoreLabel> crf;
 private ExecutorService threadpool;
 private Logger logger;

 public void initialize(UimaContext aContext)
   throws ResourceInitializationException {
  super.initialize(aContext);
  this.logger = getContext().getLogger();
  reconfigure();
 }
 public void reconfigure() throws ResourceInitializationException {
  try {
   threadpool = Executors.newCachedThreadPool();
   String dataPath = getContext().getDataPath();

   String classifierFile = (String) getContext()
     .getConfigParameterValue(CLASSIFIER_FILE_PARAM);
   System.out.println(classifierFile);
   crf = CRFClassifier
     .getClassifier(new File(dataPath, classifierFile));
  } catch (Exception e) {
   logger.log(Level.SEVERE, e.getMessage());
   throw new ResourceInitializationException(e);
  }
 }
  
 public void process(JCas jcas) throws AnalysisEngineProcessException {
  CAS cas = jcas.getCas();
  ArrayList<String> action = getAction(cas);
  List<Future<Void>> futures = new ArrayList<Future<Void>>();
  if (action.contains(STANFORDNLP_ACTION_NER)) {
   Future<Void> future = threadpool.submit(new Callable<Void>() {
    @Override
    public Void call() throws Exception {
     getNer(jcas);
     return null;
    }
   });

   futures.add(future);
  }
    //...
  for (Future<Void> future : futures) {
   try {
    future.get();
   } catch (InterruptedException | ExecutionException e) {
    throw new AnalysisEngineProcessException(e);
   }
  }
  logger.log(Level.FINE, "StanfordNERAnnotator done.");
 }
  
 private ArrayList<String> getAction(CAS cas) {
  TypeSystem ts = cas.getTypeSystem();
  Type dyInputType = ts.getType(TYPE_STANFORDNLP_INPUT);
  org.apache.uima.cas.Feature dyInputTypesFt = ts
    .getFeatureByFullName(FS_STANFORDNLP_INPUT_ACTION);

  FSIterator<?> dyIt = cas.getAnnotationIndex(dyInputType).iterator();
  String action = "";
  while (dyIt.hasNext()) {
   // TODO this is kind of weird
   AnnotationFS afs = (AnnotationFS) dyIt.next();
   String str = afs.getStringValue(dyInputTypesFt);
   if (str != null) {
    action = str;
   }
  }
  return Lists.newArrayList(splitter.split(action));
 }
  
 private void getNer(JCas jcas) {
    CAS cas=jcas.getCas();
  String docText = jcas.getDocumentText();
  List<List<CoreLabel>> classify = crf.classify(docText);

  MatchedNER preNER = null;

  TypeSystem ts = jcas.getTypeSystem();
  Type dyOutputType = ts.getType(TYPE_STANDFORDNLP_OUTPUT);
  org.apache.uima.cas.Feature dyOutputTypeFt = ts
    .getFeatureByFullName(FS_STANDFORDNLP_OUTPUT_TYPE);

  // merge co-located same entity
  for (List<CoreLabel> coreLabels : classify) {
   for (CoreLabel coreLabel : coreLabels) {
    String category = coreLabel
      .get(CoreAnnotations.AnswerAnnotation.class);
    if (NER_TYPES.contains(category)) {
     if (preNER == null) {
      preNER = new MatchedNER(category,
        coreLabel.beginPosition(),
        coreLabel.endPosition());
     } else if (category.equals(preNER.getCategory())) {
      preNER = new MatchedNER(category,
        preNER.getEntityBegin(),
        coreLabel.endPosition());
     } else {
      // add preNER
      addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
      preNER = new MatchedNER(category,
        coreLabel.beginPosition(),
        coreLabel.endPosition());
     }
    } else {
     if (preNER != null) {
      addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
      preNER = null;
     }

    }
   }
  }
  if (preNER != null) {
   addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
  }
 }
 private void addNER(MatchedNER preNER, CAS cas, Type dyOutputType,
   org.apache.uima.cas.Feature dyOutputTypeFt) {
  AnnotationFS dyAnnFS = cas.createAnnotation(dyOutputType,
    preNER.getEntityBegin(), preNER.getEntityEnd());
  dyAnnFS.setStringValue(dyOutputTypeFt, preNER.getCategory()
    .toLowerCase());
  cas.getIndexRepository().addFS(dyAnnFS);
 }

 class MatchedNER {
  private String cat;
  private int entityBegin, entityEnd;

  public MatchedNER(String cat, int entityBegin, int entityEnd) {
   this.cat = cat;
   this.entityBegin = entityBegin;
   this.entityEnd = entityEnd;
  }
 }
}
Descriptor File: StanfordNLPAnnotator.xml
We define uima types: org.apache.uima.stanfordnlp.input and org.apache.uima.stanfordnlp.output, and the configuration parameter: ClassifierFile.
<!-- Primitive analysis engine descriptor for StanfordNLPAnnotator: declares the
     ClassifierFile parameter and the input/output annotation types. -->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
 <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
 <primitive>true</primitive>
 <annotatorImplementationName>org.lifelongprogrammer.nlp.StanfordNLPAnnotator</annotatorImplementationName>
 <analysisEngineMetaData>
  <name>StanfordNLPAnnotatorAE</name>
  <description>StanfordNLPAnnotator Wrapper.</description>
  <version>1.0</version>
  <vendor>LifeLong Programmer, Inc.</vendor>
  <configurationParameters>
   <configurationParameter>
    <name>ClassifierFile</name>
    <description>Filename of the classifier file.</description>
    <type>String</type>
    <multiValued>false</multiValued>
    <mandatory>true</mandatory>
   </configurationParameter>
  </configurationParameters>
  <configurationParameterSettings>
   <nameValuePair>
    <name>ClassifierFile</name>
    <value>
     <!-- relative to pear resource file; forward slashes keep the path usable on
          every platform -->
     <string>models/classifiers/english.muc.7class.distsim.crf.ser.gz</string>
    </value>
   </nameValuePair>
  </configurationParameterSettings>
  <typeSystemDescription>
   <!-- Request annotation: its "action" feature lists the actions to run. -->
   <typeDescription>
    <name>org.apache.uima.stanfordnlp.input</name>
    <description />
    <supertypeName>uima.tcas.Annotation</supertypeName>
    <features>
     <featureDescription>
      <name>action</name>
      <description />
      <rangeTypeName>uima.cas.String</rangeTypeName>
     </featureDescription>
    </features>
   </typeDescription>

   <!-- Result annotation: the spelling "standfordnlp" must match the constant
        TYPE_STANDFORDNLP_OUTPUT used by the annotator. -->
   <typeDescription>
    <name>org.apache.uima.standfordnlp.output</name>
    <description />
    <supertypeName>uima.tcas.Annotation</supertypeName>
    <features>
     <featureDescription>
      <name>type</name>
      <description />
      <rangeTypeName>uima.cas.String</rangeTypeName>
     </featureDescription>
    </features>
   </typeDescription>
  </typeSystemDescription>
 </analysisEngineMetaData>
</analysisEngineDescription>
Annotator Test case
Here we use sujitpal's UimaUtils.java. The test adds the feature org.apache.uima.stanfordnlp.input:action=ner to the CAS, runs the CAS through the UIMA analysis engine, and then checks the org.apache.uima.stanfordnlp.output annotations in the result.
// Joins the requested actions into the comma-separated string stored in the input "action" feature.
private static final Joiner joiner = Joiner.on(",");
@Test
public void testStanfordNLPAnnotator() throws Exception {
  AnalysisEngine ae = UimaUtils.getAE("%ABS_PATH%\StanfordNLPAnnotator.xml", null);
  for (String input : INPUTS) {
    JCas jcas = ae.newJCas();
    addFSAction(jcas,Lists.newArrayList(StanfordNLPAnnotator.STANFORDNLP_ACTION_NER));
    jcas = UimaUtils.runAE(ae, input, UimaUtils.MIMETYPE_TEXT, jcas);

    Feature feature = jcas.getTypeSystem().getFeatureByFullName(
        "org.apache.uima.standfordnlp.output:type");
    org.apache.uima.cas.TypeSystem ts = jcas.getTypeSystem();
    org.apache.uima.cas.Type dyOutputType = ts
        .getType("org.apache.uima.standfordnlp.output");

    FSIndex<? extends Annotation> index = jcas
        .getAnnotationIndex(dyOutputType);
    for (Iterator<? extends Annotation> it = index.iterator(); it
        .hasNext();) {
      Annotation annotation = it.next();
      System.out.println("...(" + annotation.getBegin() + ","
          + annotation.getEnd() + "): "
          + annotation.getCoveredText() + ", type: "
          + annotation.getFeatureValueAsString(feature));
    }
  }
  ae.destroy();
}
/**
 * Adds an input feature structure to the CAS whose "action" feature holds the given
 * actions joined with commas.
 *
 * @param jcas the CAS that receives the request
 * @param action the actions to request
 */
private void addFSAction(JCas jcas, List<String> action) {
  final TypeSystem typeSystem = jcas.getTypeSystem();
  final Feature actionFeature = typeSystem.getFeatureByFullName(
      StanfordNLPAnnotator.FS_STANFORDNLP_INPUT_ACTION);
  final Type inputType = typeSystem.getType(
      StanfordNLPAnnotator.TYPE_STANFORDNLP_INPUT);

  // Build the request, fill in the joined action list, then index it.
  final FeatureStructure request = jcas.getCas().createFS(inputType);
  final String joinedActions = joiner.join(action);
  request.setStringValue(actionFeature, joinedActions);
  jcas.addFsToIndexes(request);
}
Post a Comment

Labels

Java (159) Lucene-Solr (110) All (58) Interview (58) J2SE (53) Algorithm (41) Soft Skills (36) Eclipse (34) Code Example (31) Linux (25) JavaScript (23) Spring (22) Windows (22) Web Development (20) Nutch2 (18) Tools (18) Bugs (17) Debug (15) Defects (14) Text Mining (14) J2EE (13) Network (13) PowerShell (11) Chrome (9) Design (9) How to (9) Learning code (9) Performance (9) UIMA (9) html (9) Continuous Integration (8) Dynamic Languages (8) Http Client (8) Maven (8) Security (8) Trouble Shooting (8) bat (8) blogger (8) Big Data (7) Google (7) Guava (7) JSON (7) Problem Solving (7) ANT (6) Coding Skills (6) Database (6) Scala (6) Shell (6) css (6) Algorithm Series (5) Cache (5) IDE (5) Lesson Learned (5) Programmer Skills (5) System Design (5) Tips (5) adsense (5) xml (5) AIX (4) Code Quality (4) GAE (4) Git (4) Good Programming Practices (4) Jackson (4) Memory Usage (4) Miscs (4) OpenNLP (4) Project Managment (4) Python (4) Spark (4) Testing (4) ads (4) regular-expression (4) Android (3) Apache Spark (3) Become a Better You (3) Concurrency (3) Eclipse RCP (3) English (3) Happy Hacking (3) IBM (3) J2SE Knowledge Series (3) JAX-RS (3) Jetty (3) Restful Web Service (3) Script (3) regex (3) seo (3) .Net (2) Android Studio (2) Apache (2) Apache Procrun (2) Architecture (2) Batch (2) Bit Operation (2) Build (2) Building Scalable Web Sites (2) C# (2) C/C++ (2) CSV (2) Career (2) Cassandra (2) Distributed (2) Fiddler (2) Firefox (2) Google Drive (2) Gson (2) Html Parser (2) Http (2) Image Tools (2) JQuery (2) Jersey (2) LDAP (2) Life (2) Logging (2) Software Issues (2) Storage (2) Text Search (2) xml parser (2) AOP (1) Application Design (1) AspectJ (1) Chrome DevTools (1) Cloud (1) Codility (1) Data Mining (1) Data Structure (1) ExceptionUtils (1) Exif (1) Feature Request (1) FindBugs (1) Greasemonkey (1) HTML5 (1) Httpd (1) I18N (1) IBM Java Thread Dump Analyzer (1) JDK Source Code (1) JDK8 (1) JMX (1) Lazy Developer (1) Mac (1) Machine Learning (1) Mobile (1) My Plan 
for 2010 (1) Netbeans (1) Notes (1) Operating System (1) Perl (1) Problems (1) Product Architecture (1) Programming Life (1) Quality (1) Redhat (1) Redis (1) Review (1) RxJava (1) Solutions logs (1) Team Management (1) Thread Dump Analyzer (1) Troubleshooting (1) Visualization (1) boilerpipe (1) htm (1) ongoing (1) procrun (1) rss (1)

Popular Posts