-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathnode-ner.js
More file actions
101 lines (90 loc) · 2.65 KB
/
node-ner.js
File metadata and controls
101 lines (90 loc) · 2.65 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
var exec = require('child_process').exec;
var path = require('path');
var _ = require('underscore');
/**
 * Thin wrapper around the Stanford NER command-line classifier
 * (`edu.stanford.nlp.ie.crf.CRFClassifier`).
 *
 * @constructor
 * @param {Object} [options] Overrides for the defaults listed below.
 * @param {string} [options.install_path=''] Directory that contains the jar
 *     and the `classifiers/` sub-directory. When left empty, paths are
 *     resolved relative to the current working directory.
 * @param {string} [options.jar='stanford-ner.jar'] Jar file name, relative to
 *     `install_path`.
 * @param {string} [options.classifier='english.muc.7class.distsim.crf.ser.gz']
 *     Classifier file name, relative to `install_path/classifiers`.
 * @param {string} [options.charset='UTF-8'] Value passed to `-encoding`.
 */
function ner(options) {
    // Tolerate a forgotten `new`: without this guard `this` would be the
    // global object (sloppy mode) or undefined (strict mode).
    if (!(this instanceof ner)) {
        return new ner(options);
    }
    this.options = _.extend({
        install_path: '',
        jar: 'stanford-ner.jar',
        classifier: 'english.muc.7class.distsim.crf.ser.gz',
        charset: 'UTF-8'
    }, options);
}

/**
 * Builds the argument vector for the `java` process.
 *
 * Every value is its own array element so that nothing is ever interpreted
 * by a shell; paths with spaces or shell metacharacters are passed verbatim.
 *
 * @param {Object} options The instance's resolved options.
 * @param {string} filename Text file to classify.
 * @returns {string[]} Arguments for `java`.
 */
function buildClassifierArgs(options, filename) {
    var installPath = String(options.install_path);
    return [
        '-mx1500m',
        // path.join skips an empty install_path instead of producing a
        // filesystem-root path such as "/stanford-ner.jar".
        '-cp', path.join(installPath, String(options.jar)),
        'edu.stanford.nlp.ie.crf.CRFClassifier',
        '-loadClassifier', path.join(installPath, 'classifiers', String(options.classifier)),
        '-textFile', String(filename),
        '-encoding', String(options.charset)
    ];
}

/**
 * Runs the classifier on a text file and passes the extracted entities to
 * `callback` (see `parse` for the result shape).
 *
 * NOTE: if the java process fails, the error is logged and `callback` is
 * NOT invoked; callers receive no failure notification.
 *
 * @param {string} filename Path of the text file to classify.
 * @param {function(Object)} callback Receives the entities object.
 */
ner.prototype.fromFile = function(filename, callback) {
    var self = this;
    // execFile (not exec) so the arguments bypass the shell entirely.
    var execFile = require('child_process').execFile;
    execFile(
        'java',
        buildClassifierArgs(this.options, filename),
        {maxBuffer: 500 * 1024}, // cap on captured output
        function(error, stdout) {
            if (error) {
                console.log("ERROR:", error);
                return false;
            }
            self.parse(stdout, callback);
        }
    );
};

/**
 * Parses classifier output made of whitespace-separated `word/TAG` tokens
 * and groups consecutive words that share the same tag into one entity.
 * Tokens tagged `O` are not entities and terminate the current group.
 *
 * @param {string} parsed Raw classifier output. `null`/`undefined` is
 *     treated as empty output; other values are converted with `String()`.
 * @param {function(Object)} callback Receives an object mapping each tag to
 *     an array of entity strings (words joined by a single space), in order
 *     of appearance.
 */
ner.prototype.parse = function(parsed, callback) {
    var text = parsed == null ? '' : String(parsed);
    var tokens = text.split(/\s/);
    // Greedy `.+` makes the LAST "/TAG" in a token win, so words that
    // themselves contain slashes keep their full text.
    var tokenPattern = /(.+)\/([A-Z]+)/;

    var entities = {};
    var run = [];      // words of the entity currently being collected
    var runTag = null; // tag shared by the words in `run`

    // Stores the collected run (if any) under its tag and starts a new one.
    function flush() {
        if (run.length === 0) {
            return;
        }
        if (!Object.prototype.hasOwnProperty.call(entities, runTag)) {
            entities[runTag] = [];
        }
        entities[runTag].push(run.join(' '));
        run = [];
    }

    for (var i = 0; i < tokens.length; i++) {
        var match = tokenPattern.exec(tokens[i]);
        if (!match) {
            // Empty strings from consecutive whitespace, or untagged text.
            continue;
        }
        var word = match[1];
        var tag = match[2];

        if (tag === 'O') {
            flush();
            runTag = null;
            continue;
        }
        if (tag !== runTag) {
            // A different entity type starts here; close the previous one.
            flush();
            runTag = tag;
        }
        run.push(word);
    }
    // Account for an entity that runs up to the end of the input.
    flush();

    callback(entities);
};
// Expose the `ner` constructor as this module's export.
module.exports = ner;