The Goal
To improve our text analytics project, after integrating OpenNLP with UIMA, we are now integrating Stanford NLP NER (Named Entity Recognition) into UIMA.
StanfordNLPAnnotator
Feature Structure: org.apache.uima.stanfordnlp.input:action
We use StanfordNLPAnnotator as the gateway (facade): the client sets org.apache.uima.stanfordnlp.input:action to specify what to extract. For example, action=ner runs named entity extraction and action=sentiment runs sentiment analysis; several actions can be given as a comma-separated list.
We use a dynamic output annotation type, org.apache.uima.standfordnlp.output (note that the code and the descriptor spell it "standfordnlp"); its type feature specifies whether the entity is a person, an organization, etc.
The configuration parameter ClassifierFile specifies the model file that NER uses.
package org.lifelongprogrammer.nlp;
public class StanfordNLPAnnotator extends JCasAnnotator_ImplBase {
public static final String STANFORDNLP_ACTION_NER = "ner";
public static final String TYPE_STANDFORDNLP_OUTPUT = "org.apache.uima.standfordnlp.output";
public static final String FS_STANDFORDNLP_OUTPUT_TYPE = TYPE_STANDFORDNLP_OUTPUT
+ ":type";
public static final String TYPE_STANFORDNLP_INPUT = "org.apache.uima.stanfordnlp.input";
public static final String FS_STANFORDNLP_INPUT_ACTION = TYPE_STANFORDNLP_INPUT
+ ":action";
// http://nlp.stanford.edu/software/CRF-NER.shtml
private static final Set<String> NER_TYPES = new HashSet<String>(
Arrays.asList("PERSON", "ORGANIZATION", "LOCATION", "MISC", "TIME",
"MONEY", "PERCENT", "DATE"));
private static Splitter splitter = Splitter.on(",").trimResults()
.omitEmptyStrings();
public static final String CLASSIFIER_FILE_PARAM = "ClassifierFile";
private CRFClassifier<CoreLabel> crf;
private ExecutorService threadpool;
private Logger logger;
public void initialize(UimaContext aContext)
throws ResourceInitializationException {
super.initialize(aContext);
this.logger = getContext().getLogger();
reconfigure();
}
public void reconfigure() throws ResourceInitializationException {
try {
threadpool = Executors.newCachedThreadPool();
String dataPath = getContext().getDataPath();
String classifierFile = (String) getContext()
.getConfigParameterValue(CLASSIFIER_FILE_PARAM);
System.out.println(classifierFile);
crf = CRFClassifier
.getClassifier(new File(dataPath, classifierFile));
} catch (Exception e) {
logger.log(Level.SEVERE, e.getMessage());
throw new ResourceInitializationException(e);
}
}
public void process(JCas jcas) throws AnalysisEngineProcessException {
CAS cas = jcas.getCas();
ArrayList<String> action = getAction(cas);
List<Future<Void>> futures = new ArrayList<Future<Void>>();
if (action.contains(STANFORDNLP_ACTION_NER)) {
Future<Void> future = threadpool.submit(new Callable<Void>() {
@Override
public Void call() throws Exception {
getNer(jcas);
return null;
}
});
futures.add(future);
}
//...
for (Future<Void> future : futures) {
try {
future.get();
} catch (InterruptedException | ExecutionException e) {
throw new AnalysisEngineProcessException(e);
}
}
logger.log(Level.FINE, "StanfordNERAnnotator done.");
}
private ArrayList<String> getAction(CAS cas) {
TypeSystem ts = cas.getTypeSystem();
Type dyInputType = ts.getType(TYPE_STANFORDNLP_INPUT);
org.apache.uima.cas.Feature dyInputTypesFt = ts
.getFeatureByFullName(FS_STANFORDNLP_INPUT_ACTION);
FSIterator<?> dyIt = cas.getAnnotationIndex(dyInputType).iterator();
String action = "";
while (dyIt.hasNext()) {
// TODO this is kind of weird
AnnotationFS afs = (AnnotationFS) dyIt.next();
String str = afs.getStringValue(dyInputTypesFt);
if (str != null) {
action = str;
}
}
return Lists.newArrayList(splitter.split(action));
}
private void getNer(JCas jcas) {
CAS cas=jcas.getCas();
String docText = jcas.getDocumentText();
List<List<CoreLabel>> classify = crf.classify(docText);
MatchedNER preNER = null;
TypeSystem ts = jcas.getTypeSystem();
Type dyOutputType = ts.getType(TYPE_STANDFORDNLP_OUTPUT);
org.apache.uima.cas.Feature dyOutputTypeFt = ts
.getFeatureByFullName(FS_STANDFORDNLP_OUTPUT_TYPE);
// merge co-located same entity
for (List<CoreLabel> coreLabels : classify) {
for (CoreLabel coreLabel : coreLabels) {
String category = coreLabel
.get(CoreAnnotations.AnswerAnnotation.class);
if (NER_TYPES.contains(category)) {
if (preNER == null) {
preNER = new MatchedNER(category,
coreLabel.beginPosition(),
coreLabel.endPosition());
} else if (category.equals(preNER.getCategory())) {
preNER = new MatchedNER(category,
preNER.getEntityBegin(),
coreLabel.endPosition());
} else {
// add preNER
addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
preNER = new MatchedNER(category,
coreLabel.beginPosition(),
coreLabel.endPosition());
}
} else {
if (preNER != null) {
addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
preNER = null;
}
}
}
}
if (preNER != null) {
addNER(preNER, cas, dyOutputType, dyOutputTypeFt);
}
}
private void addNER(MatchedNER preNER, CAS cas, Type dyOutputType,
org.apache.uima.cas.Feature dyOutputTypeFt) {
AnnotationFS dyAnnFS = cas.createAnnotation(dyOutputType,
preNER.getEntityBegin(), preNER.getEntityEnd());
dyAnnFS.setStringValue(dyOutputTypeFt, preNER.getCategory()
.toLowerCase());
cas.getIndexRepository().addFS(dyAnnFS);
}
class MatchedNER {
private String cat;
private int entityBegin, entityEnd;
public MatchedNER(String cat, int entityBegin, int entityEnd) {
this.cat = cat;
this.entityBegin = entityBegin;
this.entityEnd = entityEnd;
}
}
}
Descriptor File: StanfordNLPAnnotator.xml
We define the UIMA types org.apache.uima.stanfordnlp.input and org.apache.uima.standfordnlp.output (note the "standfordnlp" spelling of the output type, which matches the Java constants), and the configuration parameter ClassifierFile.
<!-- Primitive analysis engine descriptor for StanfordNLPAnnotator: declares the
     ClassifierFile parameter and the input/output annotation types. -->
<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
  <primitive>true</primitive>
  <!-- Kept on one line so no stray whitespace ends up in the class name. -->
  <annotatorImplementationName>org.lifelongprogrammer.nlp.StanfordNLPAnnotator</annotatorImplementationName>
  <analysisEngineMetaData>
    <name>StanfordNLPAnnotatorAE</name>
    <description>StanfordNLPAnnotator Wrapper.</description>
    <version>1.0</version>
    <vendor>LifeLong Programmer, Inc.</vendor>
    <configurationParameters>
      <configurationParameter>
        <name>ClassifierFile</name>
        <description>Filename of the classifier file.</description>
        <type>String</type>
        <multiValued>false</multiValued>
        <mandatory>true</mandatory>
      </configurationParameter>
    </configurationParameters>
    <configurationParameterSettings>
      <nameValuePair>
        <name>ClassifierFile</name>
        <value>
          <!-- relative to pear resource file -->
          <!-- Forward slashes work on every platform; the value is kept on a
               single line so the file name carries no trailing newline. -->
          <string>models/classifiers/english.muc.7class.distsim.crf.ser.gz</string>
        </value>
      </nameValuePair>
    </configurationParameterSettings>
    <typeSystemDescription>
      <!-- Input annotation: its "action" feature lists the requested actions. -->
      <typeDescription>
        <name>org.apache.uima.stanfordnlp.input</name>
        <description />
        <supertypeName>uima.tcas.Annotation</supertypeName>
        <features>
          <featureDescription>
            <name>action</name>
            <description />
            <rangeTypeName>uima.cas.String</rangeTypeName>
          </featureDescription>
        </features>
      </typeDescription>
      <!-- Output annotation. The "standfordnlp" spelling must stay in sync with
           StanfordNLPAnnotator.TYPE_STANDFORDNLP_OUTPUT. -->
      <typeDescription>
        <name>org.apache.uima.standfordnlp.output</name>
        <description />
        <supertypeName>uima.tcas.Annotation</supertypeName>
        <features>
          <featureDescription>
            <name>type</name>
            <description />
            <rangeTypeName>uima.cas.String</rangeTypeName>
          </featureDescription>
        </features>
      </typeDescription>
    </typeSystemDescription>
  </analysisEngineMetaData>
</analysisEngineDescription>
Annotator Test case
Here we use sujitpal's UimaUtils.java: the test adds the feature org.apache.uima.stanfordnlp.input:action=ner to the CAS, runs the CAS through the analysis engine, and then prints the org.apache.uima.standfordnlp.output annotations found in the result.
// Joins the requested action names with "," before they are stored in the input feature.
private static final Joiner joiner = Joiner.on(",");
@Test
public void testStanfordNLPAnnotator() throws Exception {
AnalysisEngine ae = UimaUtils.getAE("%ABS_PATH%\StanfordNLPAnnotator.xml", null);
for (String input : INPUTS) {
JCas jcas = ae.newJCas();
addFSAction(jcas,Lists.newArrayList(StanfordNLPAnnotator.STANFORDNLP_ACTION_NER));
jcas = UimaUtils.runAE(ae, input, UimaUtils.MIMETYPE_TEXT, jcas);
Feature feature = jcas.getTypeSystem().getFeatureByFullName(
"org.apache.uima.standfordnlp.output:type");
org.apache.uima.cas.TypeSystem ts = jcas.getTypeSystem();
org.apache.uima.cas.Type dyOutputType = ts
.getType("org.apache.uima.standfordnlp.output");
FSIndex<? extends Annotation> index = jcas
.getAnnotationIndex(dyOutputType);
for (Iterator<? extends Annotation> it = index.iterator(); it
.hasNext();) {
Annotation annotation = it.next();
System.out.println("...(" + annotation.getBegin() + ","
+ annotation.getEnd() + "): "
+ annotation.getCoveredText() + ", type: "
+ annotation.getFeatureValueAsString(feature));
}
}
ae.destroy();
}
/**
 * Adds an input feature structure to the CAS whose action feature holds the
 * given actions joined with commas.
 *
 * @param jcas the CAS that receives the input feature structure
 * @param action the action names to request
 */
private void addFSAction(JCas jcas, List<String> action) {
    TypeSystem typeSystem = jcas.getTypeSystem();
    Feature actionFeature = typeSystem
            .getFeatureByFullName(StanfordNLPAnnotator.FS_STANFORDNLP_INPUT_ACTION);
    Type inputType = typeSystem.getType(StanfordNLPAnnotator.TYPE_STANFORDNLP_INPUT);

    FeatureStructure inputFs = jcas.getCas().createFS(inputType);
    String joinedActions = joiner.join(action);
    inputFs.setStringValue(actionFeature, joinedActions);
    jcas.addFsToIndexes(inputFs);
}