The Goal
To improve our text analytics project, after integrating OpenNLP with UIMA, we are now integrating Stanford NLP NER (Named Entity Recognition) into UIMA.
StanfordNLPAnnotator
Feature Structure: org.apache.uima.stanfordnlp.input:action
We use StanfordNLPAnnotator as the gateway, or facade: the client uses org.apache.uima.stanfordnlp.input:action to specify what to extract, e.g. action=ner to run named entity extraction or action=sentiment to run sentiment analysis.
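We use a dynamic output annotation, org.apache.uima.stanfordnlp.output, whose type feature specifies whether the entity is a person, an organization, etc. (Note: the code and descriptor below actually declare this type as org.apache.uima.standfordnlp.output, with an extra "d".)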
The configuration parameter ClassifierFile specifies the model file that NER uses.
package org.lifelongprogrammer.nlp; public class StanfordNLPAnnotator extends JCasAnnotator_ImplBase { public static final String STANFORDNLP_ACTION_NER = "ner"; public static final String TYPE_STANDFORDNLP_OUTPUT = "org.apache.uima.standfordnlp.output"; public static final String FS_STANDFORDNLP_OUTPUT_TYPE = TYPE_STANDFORDNLP_OUTPUT + ":type"; public static final String TYPE_STANFORDNLP_INPUT = "org.apache.uima.stanfordnlp.input"; public static final String FS_STANFORDNLP_INPUT_ACTION = TYPE_STANFORDNLP_INPUT + ":action"; // http://nlp.stanford.edu/software/CRF-NER.shtml private static final Set<String> NER_TYPES = new HashSet<String>( Arrays.asList("PERSON", "ORGANIZATION", "LOCATION", "MISC", "TIME", "MONEY", "PERCENT", "DATE")); private static Splitter splitter = Splitter.on(",").trimResults() .omitEmptyStrings(); public static final String CLASSIFIER_FILE_PARAM = "ClassifierFile"; private CRFClassifier<CoreLabel> crf; private ExecutorService threadpool; private Logger logger; public void initialize(UimaContext aContext) throws ResourceInitializationException { super.initialize(aContext); this.logger = getContext().getLogger(); reconfigure(); } public void reconfigure() throws ResourceInitializationException { try { threadpool = Executors.newCachedThreadPool(); String dataPath = getContext().getDataPath(); String classifierFile = (String) getContext() .getConfigParameterValue(CLASSIFIER_FILE_PARAM); System.out.println(classifierFile); crf = CRFClassifier .getClassifier(new File(dataPath, classifierFile)); } catch (Exception e) { logger.log(Level.SEVERE, e.getMessage()); throw new ResourceInitializationException(e); } } public void process(JCas jcas) throws AnalysisEngineProcessException { CAS cas = jcas.getCas(); ArrayList<String> action = getAction(cas); List<Future<Void>> futures = new ArrayList<Future<Void>>(); if (action.contains(STANFORDNLP_ACTION_NER)) { Future<Void> future = threadpool.submit(new Callable<Void>() { @Override public Void call() throws 
Exception { getNer(jcas); return null; } }); futures.add(future); } //... for (Future<Void> future : futures) { try { future.get(); } catch (InterruptedException | ExecutionException e) { throw new AnalysisEngineProcessException(e); } } logger.log(Level.FINE, "StanfordNERAnnotator done."); } private ArrayList<String> getAction(CAS cas) { TypeSystem ts = cas.getTypeSystem(); Type dyInputType = ts.getType(TYPE_STANFORDNLP_INPUT); org.apache.uima.cas.Feature dyInputTypesFt = ts .getFeatureByFullName(FS_STANFORDNLP_INPUT_ACTION); FSIterator<?> dyIt = cas.getAnnotationIndex(dyInputType).iterator(); String action = ""; while (dyIt.hasNext()) { // TODO this is kind of weird AnnotationFS afs = (AnnotationFS) dyIt.next(); String str = afs.getStringValue(dyInputTypesFt); if (str != null) { action = str; } } return Lists.newArrayList(splitter.split(action)); } private void getNer(JCas jcas) { CAS cas=jcas.getCas(); String docText = jcas.getDocumentText(); List<List<CoreLabel>> classify = crf.classify(docText); MatchedNER preNER = null; TypeSystem ts = jcas.getTypeSystem(); Type dyOutputType = ts.getType(TYPE_STANDFORDNLP_OUTPUT); org.apache.uima.cas.Feature dyOutputTypeFt = ts .getFeatureByFullName(FS_STANDFORDNLP_OUTPUT_TYPE); // merge co-located same entity for (List<CoreLabel> coreLabels : classify) { for (CoreLabel coreLabel : coreLabels) { String category = coreLabel .get(CoreAnnotations.AnswerAnnotation.class); if (NER_TYPES.contains(category)) { if (preNER == null) { preNER = new MatchedNER(category, coreLabel.beginPosition(), coreLabel.endPosition()); } else if (category.equals(preNER.getCategory())) { preNER = new MatchedNER(category, preNER.getEntityBegin(), coreLabel.endPosition()); } else { // add preNER addNER(preNER, cas, dyOutputType, dyOutputTypeFt); preNER = new MatchedNER(category, coreLabel.beginPosition(), coreLabel.endPosition()); } } else { if (preNER != null) { addNER(preNER, cas, dyOutputType, dyOutputTypeFt); preNER = null; } } } } if (preNER != null) 
{ addNER(preNER, cas, dyOutputType, dyOutputTypeFt); } } private void addNER(MatchedNER preNER, CAS cas, Type dyOutputType, org.apache.uima.cas.Feature dyOutputTypeFt) { AnnotationFS dyAnnFS = cas.createAnnotation(dyOutputType, preNER.getEntityBegin(), preNER.getEntityEnd()); dyAnnFS.setStringValue(dyOutputTypeFt, preNER.getCategory() .toLowerCase()); cas.getIndexRepository().addFS(dyAnnFS); } class MatchedNER { private String cat; private int entityBegin, entityEnd; public MatchedNER(String cat, int entityBegin, int entityEnd) { this.cat = cat; this.entityBegin = entityBegin; this.entityEnd = entityEnd; } } }
Descriptor File: StanfordNLPAnnotator.xml
We define the UIMA types org.apache.uima.stanfordnlp.input and org.apache.uima.stanfordnlp.output, and the configuration parameter ClassifierFile.
<!--
  Analysis engine descriptor for StanfordNLPAnnotator.
  Declares the ClassifierFile configuration parameter and the input/output annotation types.
-->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
  <annotatorImplementationName>org.lifelongprogrammer.nlp.StanfordNLPAnnotator</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>StanfordNLPAnnotatorAE</name>
    <description>StanfordNLPAnnotator Wrapper.</description>
    <version>1.0</version>
    <vendor>LifeLong Programmer, Inc.</vendor>
    <configurationParameters>
      <configurationParameter>
        <name>ClassifierFile</name>
        <description>Filename of the classifier file.</description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
    </configurationParameters>
    <configurationParameterSettings>
      <nameValuePair>
        <name>ClassifierFile</name>
        <value>
          <!-- relative to pear resource file -->
          <!-- Forward slashes so the path resolves on every platform, not only on Windows. -->
          <string>models/classifiers/english.muc.7class.distsim.crf.ser.gz</string>
        </value>
      </nameValuePair>
    </configurationParameterSettings>
    <typeSystemDescription>
      <!-- Input: the client sets "action" to a comma-separated list of actions, e.g. "ner". -->
      <typeDescription>
        <name>org.apache.uima.stanfordnlp.input</name>
        <description />
        <supertypeName>uima.tcas.Annotation</supertypeName>
        <features>
          <featureDescription>
            <name>action</name>
            <description />
            <rangeTypeName>uima.cas.String</rangeTypeName>
          </featureDescription>
        </features>
      </typeDescription>
      <!--
        Output: one annotation per recognized entity; "type" holds the entity category.
        The "standfordnlp" spelling must match the type name the annotator looks up.
      -->
      <typeDescription>
        <name>org.apache.uima.standfordnlp.output</name>
        <description />
        <supertypeName>uima.tcas.Annotation</supertypeName>
        <features>
          <featureDescription>
            <name>type</name>
            <description />
            <rangeTypeName>uima.cas.String</rangeTypeName>
          </featureDescription>
        </features>
      </typeDescription>
    </typeSystemDescription>
  </analysisEngineMetaData>
</analysisEngineDescription>
Annotator Test case
Here we use sujitpal's UimaUtils.java. The test adds the feature org.apache.uima.stanfordnlp.input:action=ner to the CAS, sends the CAS to the UIMA server, and then checks the org.apache.uima.stanfordnlp.output feature in the response.
private static final Joiner joiner = Joiner.on(","); @Test public void testStanfordNLPAnnotator() throws Exception { AnalysisEngine ae = UimaUtils.getAE("%ABS_PATH%\StanfordNLPAnnotator.xml", null); for (String input : INPUTS) { JCas jcas = ae.newJCas(); addFSAction(jcas,Lists.newArrayList(StanfordNLPAnnotator.STANFORDNLP_ACTION_NER)); jcas = UimaUtils.runAE(ae, input, UimaUtils.MIMETYPE_TEXT, jcas); Feature feature = jcas.getTypeSystem().getFeatureByFullName( "org.apache.uima.standfordnlp.output:type"); org.apache.uima.cas.TypeSystem ts = jcas.getTypeSystem(); org.apache.uima.cas.Type dyOutputType = ts .getType("org.apache.uima.standfordnlp.output"); FSIndex<? extends Annotation> index = jcas .getAnnotationIndex(dyOutputType); for (Iterator<? extends Annotation> it = index.iterator(); it .hasNext();) { Annotation annotation = it.next(); System.out.println("...(" + annotation.getBegin() + "," + annotation.getEnd() + "): " + annotation.getCoveredText() + ", type: " + annotation.getFeatureValueAsString(feature)); } } ae.destroy(); } private void addFSAction(JCas jcas, List<String> action) { TypeSystem ts = jcas.getTypeSystem(); Feature ft = ts .getFeatureByFullName(StanfordNLPAnnotator.FS_STANFORDNLP_INPUT_ACTION); Type type = ts.getType(StanfordNLPAnnotator.TYPE_STANFORDNLP_INPUT); FeatureStructure fs = jcas.getCas().createFS(type); fs.setStringValue(ft, joiner.join(action)); jcas.addFsToIndexes(fs); }